[llvm] [GlobalISel] Combine G_UNMERGE_VALUES from opaque vectors into scalars (PR #113040)

Thorsten Schütt via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 19 09:53:29 PDT 2024


https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/113040

From d08de498d8b234d75e791665e28f7811fb499d27 Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Sat, 19 Oct 2024 15:58:45 +0200
Subject: [PATCH 1/2] [GlobalISel] Combine G_UNMERGE_VALUES from opaque vectors
 into scalars

%opaque:_(<2 x s64>) = G_OPAQUE
%un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>)

->

%zero:_(s64) = G_CONSTANT i64 0
%one:_(s64) = G_CONSTANT i64 1
%un1:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %zero
%un2:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %one

Note: an observed legalization failure (for s128 element types):
unable to legalize instruction: %5:_(s128) = G_EXTRACT_VECTOR_ELT %3:_(<2 x s128>), %7:_(s64) (in function: fabs_v2f128)

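For reference, a condensed sketch of the new combine (matchUnmergeValuesOfScalarAndVector, added in CombinerHelperArtifacts.cpp below); the surrounding CombinerHelper and MachineIRBuilder context is assumed:

  const GUnmerge *Unmerge = cast<GUnmerge>(&MI);
  LLT DstTy = MRI.getType(Unmerge->getReg(0));
  LLT SrcTy = MRI.getType(Unmerge->getSourceReg());

  // Only fire for vector-to-scalar unmerges whose scalar type is at most
  // 64 bits wide (and, in the full version, whose source is not a
  // G_BUILD_VECTOR).
  if (!DstTy.isScalar() || !SrcTy.isFixedVector() || DstTy.getSizeInBits() > 64)
    return false;

  LLT IdxTy = LLT::scalar(
      getTargetLowering().getVectorIdxTy(getDataLayout()).getSizeInBits());

  // Rewrite each def of the G_UNMERGE_VALUES as an extract at a constant index.
  MatchInfo = [=](MachineIRBuilder &B) {
    for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) {
      auto Index = B.buildConstant(IdxTy, I);
      B.buildExtractVectorElement(Unmerge->getOperand(I).getReg(),
                                  Unmerge->getSourceReg(), Index);
    }
  };
  return true;

The full version additionally checks legality of G_EXTRACT_VECTOR_ELT and of the index constants, and bails out when the unmerge has more than 8 defs.
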
 Test:
    llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |    4 +
 .../include/llvm/Target/GlobalISel/Combine.td |   11 +-
 llvm/lib/CodeGen/GlobalISel/CMakeLists.txt    |    1 +
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |   82 -
 .../GlobalISel/CombinerHelperArtifacts.cpp    |  169 ++
 llvm/lib/Target/AArch64/AArch64Combine.td     |    2 +-
 .../AArch64/GlobalISel/combine-unmerge.mir    |  104 +-
 llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll  |  146 +-
 llvm/test/CodeGen/AArch64/abs.ll              |  152 +-
 llvm/test/CodeGen/AArch64/add.ll              |   25 +-
 llvm/test/CodeGen/AArch64/andorxor.ll         |   75 +-
 llvm/test/CodeGen/AArch64/arm64-fp128.ll      |    7 +-
 llvm/test/CodeGen/AArch64/bitcast.ll          |   25 +-
 llvm/test/CodeGen/AArch64/bswap.ll            |   74 +-
 llvm/test/CodeGen/AArch64/fabs.ll             |   54 +-
 llvm/test/CodeGen/AArch64/faddsub.ll          |  178 +-
 llvm/test/CodeGen/AArch64/fcmp.ll             |  405 +--
 llvm/test/CodeGen/AArch64/fcopysign.ll        |   85 +-
 llvm/test/CodeGen/AArch64/fcvt.ll             |  511 +++-
 llvm/test/CodeGen/AArch64/fdiv.ll             |   89 +-
 llvm/test/CodeGen/AArch64/fexplog.ll          |  230 +-
 .../AArch64/fixed-vector-deinterleave.ll      |    8 +-
 llvm/test/CodeGen/AArch64/fminimummaximum.ll  |  178 +-
 llvm/test/CodeGen/AArch64/fminmax.ll          |  178 +-
 llvm/test/CodeGen/AArch64/fmla.ll             |  320 ++-
 llvm/test/CodeGen/AArch64/fmul.ll             |   89 +-
 llvm/test/CodeGen/AArch64/fneg.ll             |   52 +-
 llvm/test/CodeGen/AArch64/fpext.ll            |   39 +-
 llvm/test/CodeGen/AArch64/fpow.ll             |   58 +-
 llvm/test/CodeGen/AArch64/fpowi.ll            |   44 +-
 llvm/test/CodeGen/AArch64/fptoi.ll            |  518 +++-
 .../test/CodeGen/AArch64/fptosi-sat-vector.ll |  217 +-
 .../test/CodeGen/AArch64/fptoui-sat-vector.ll |  194 +-
 llvm/test/CodeGen/AArch64/fptrunc.ll          |   58 +-
 llvm/test/CodeGen/AArch64/frem.ll             |   58 +-
 llvm/test/CodeGen/AArch64/fsincos.ll          |   92 +-
 llvm/test/CodeGen/AArch64/fsqrt.ll            |   69 +-
 llvm/test/CodeGen/AArch64/icmp.ll             |   56 +-
 llvm/test/CodeGen/AArch64/insertextract.ll    |   30 +-
 llvm/test/CodeGen/AArch64/itofp.ll            |  714 ++++--
 llvm/test/CodeGen/AArch64/llvm.exp10.ll       |   81 +-
 llvm/test/CodeGen/AArch64/load.ll             |  104 +-
 llvm/test/CodeGen/AArch64/mul.ll              |   29 +-
 llvm/test/CodeGen/AArch64/rem.ll              |  126 +-
 llvm/test/CodeGen/AArch64/sext.ll             |  118 +-
 llvm/test/CodeGen/AArch64/shift.ll            |  564 ++++-
 llvm/test/CodeGen/AArch64/shufflevector.ll    |  289 ++-
 llvm/test/CodeGen/AArch64/sub.ll              |   25 +-
 .../AArch64/vecreduce-umax-legalization.ll    |   22 +-
 llvm/test/CodeGen/AArch64/xtn.ll              |   17 +-
 llvm/test/CodeGen/AArch64/zext.ll             |   98 +-
 .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll    |   57 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |   14 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll  |    9 +
 .../combine-fma-add-mul-pre-legalize.mir      |  240 +-
 .../GlobalISel/combine-fma-unmerge-values.mir |   60 +-
 .../CodeGen/AMDGPU/GlobalISel/dummy-target.ll |   16 +-
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll     |  382 +--
 llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll    |   82 +-
 .../CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll   |   12 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll   |   98 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   |   20 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   |   90 +-
 .../legalize-llvm.amdgcn.image.dim.a16.ll     | 1098 ++++++---
 .../legalize-llvm.amdgcn.image.load.2d.d16.ll |   94 +-
 .../legalize-llvm.amdgcn.image.load.2d.ll     |  228 +-
 ...lize-llvm.amdgcn.image.load.2darraymsaa.ll |   48 +-
 .../legalize-llvm.amdgcn.image.sample.a16.ll  | 2184 +++++++++++------
 .../legalize-llvm.amdgcn.image.sample.d.ll    |  144 +-
 ...galize-llvm.amdgcn.image.sample.g16.a16.ll |  108 +-
 .../legalize-llvm.amdgcn.image.sample.g16.ll  |  630 +++--
 ...legalize-llvm.amdgcn.image.store.2d.d16.ll |  135 +-
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll     |   14 +-
 .../load-legalize-range-metadata.ll           |   16 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |    8 +-
 .../regbankselect-amdgcn.s.buffer.load.ll     |   28 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll |   17 +-
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   |   14 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |   10 +-
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll |   17 +-
 .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll |    3 +-
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll |    3 +-
 .../CodeGen/AMDGPU/integer-mad-patterns.ll    |   25 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |   71 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |   71 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |   45 +-
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |   10 +-
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll        |   42 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |  111 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |  111 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |   81 +-
 llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll        |  167 +-
 llvm/test/CodeGen/AMDGPU/roundeven.ll         |   84 +-
 llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll |    6 +-
 94 files changed, 9544 insertions(+), 4033 deletions(-)
 create mode 100644 llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9240a3c3127eb4..87409c88788e6a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -922,6 +922,10 @@ class CombinerHelper {
   bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
                                            BuildFnTy &MatchInfo);
 
+  // unmerge_values(opaque vector) -> extract vector elt
+  bool matchUnmergeValuesOfScalarAndVector(const MachineInstr &MI,
+                                           BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ead4149fc11068..39dd58837d5750 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -840,6 +840,14 @@ def unmerge_anyext_build_vector : GICombineRule<
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
 >;
 
+// Transform unmerge opaque vector -> extract vector elt
+def unmerge_opaque_vector : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_UNMERGE_VALUES): $root,
+  [{ return Helper.matchUnmergeValuesOfScalarAndVector(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
+>;
+
 // Transform x,y = unmerge(zext(z)) -> x = zext z; y = 0.
 def unmerge_zext_to_zext : GICombineRule<
   (defs root:$d),
@@ -855,7 +863,8 @@ def merge_combines: GICombineGroup<[
   unmerge_cst,
   unmerge_undef,
   unmerge_dead_to_trunc,
-  unmerge_zext_to_zext
+  unmerge_zext_to_zext,
+  unmerge_opaque_vector
 ]>;
 
 // Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index af1717dbf76f39..a45024d120be68 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel
   GlobalISel.cpp
   Combiner.cpp
   CombinerHelper.cpp
+  CombinerHelperArtifacts.cpp
   CombinerHelperCasts.cpp
   CombinerHelperCompares.cpp
   CombinerHelperVectorOps.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b7ddf9f479ef8e..f9b1621955c217 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7611,85 +7611,3 @@ bool CombinerHelper::matchFoldAMinusC1PlusC2(const MachineInstr &MI,
 
   return true;
 }
-
-bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
-                                                         BuildFnTy &MatchInfo) {
-  const GUnmerge *Unmerge = cast<GUnmerge>(&MI);
-
-  if (!MRI.hasOneNonDBGUse(Unmerge->getSourceReg()))
-    return false;
-
-  const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg());
-
-  LLT DstTy = MRI.getType(Unmerge->getReg(0));
-
-  // $bv:_(<8 x s8>) = G_BUILD_VECTOR ....
-  // $any:_(<8 x s16>) = G_ANYEXT $bv
-  // $uv:_(<4 x s16>), $uv1:_(<4 x s16>) = G_UNMERGE_VALUES $any
-  //
-  // ->
-  //
-  // $any:_(s16) = G_ANYEXT $bv[0]
-  // $any1:_(s16) = G_ANYEXT $bv[1]
-  // $any2:_(s16) = G_ANYEXT $bv[2]
-  // $any3:_(s16) = G_ANYEXT $bv[3]
-  // $any4:_(s16) = G_ANYEXT $bv[4]
-  // $any5:_(s16) = G_ANYEXT $bv[5]
-  // $any6:_(s16) = G_ANYEXT $bv[6]
-  // $any7:_(s16) = G_ANYEXT $bv[7]
-  // $uv:_(<4 x s16>) = G_BUILD_VECTOR $any, $any1, $any2, $any3
-  // $uv1:_(<4 x s16>) = G_BUILD_VECTOR $any4, $any5, $any6, $any7
-
-  // We want to unmerge into vectors.
-  if (!DstTy.isFixedVector())
-    return false;
-
-  const GAnyExt *Any = dyn_cast<GAnyExt>(Source);
-  if (!Any)
-    return false;
-
-  const MachineInstr *NextSource = MRI.getVRegDef(Any->getSrcReg());
-
-  if (const GBuildVector *BV = dyn_cast<GBuildVector>(NextSource)) {
-    // G_UNMERGE_VALUES G_ANYEXT G_BUILD_VECTOR
-
-    if (!MRI.hasOneNonDBGUse(BV->getReg(0)))
-      return false;
-
-    // FIXME: check element types?
-    if (BV->getNumSources() % Unmerge->getNumDefs() != 0)
-      return false;
-
-    LLT BigBvTy = MRI.getType(BV->getReg(0));
-    LLT SmallBvTy = DstTy;
-    LLT SmallBvElemenTy = SmallBvTy.getElementType();
-
-    if (!isLegalOrBeforeLegalizer(
-            {TargetOpcode::G_BUILD_VECTOR, {SmallBvTy, SmallBvElemenTy}}))
-      return false;
-
-    // We check the legality of scalar anyext.
-    if (!isLegalOrBeforeLegalizer(
-            {TargetOpcode::G_ANYEXT,
-             {SmallBvElemenTy, BigBvTy.getElementType()}}))
-      return false;
-
-    MatchInfo = [=](MachineIRBuilder &B) {
-      // Build into each G_UNMERGE_VALUES def
-      // a small build vector with anyext from the source build vector.
-      for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) {
-        SmallVector<Register> Ops;
-        for (unsigned J = 0; J < SmallBvTy.getNumElements(); ++J) {
-          Register SourceArray =
-              BV->getSourceReg(I * SmallBvTy.getNumElements() + J);
-          auto AnyExt = B.buildAnyExt(SmallBvElemenTy, SourceArray);
-          Ops.push_back(AnyExt.getReg(0));
-        }
-        B.buildBuildVector(Unmerge->getOperand(I).getReg(), Ops);
-      };
-    };
-    return true;
-  };
-
-  return false;
-}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
new file mode 100644
index 00000000000000..805d34ae0493c4
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -0,0 +1,169 @@
+//===- CombinerHelperArtifacts.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements CombinerHelper for legalization artifacts.
+//
+//===----------------------------------------------------------------------===//
+//
+// G_UNMERGE_VALUES
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
+                                                         BuildFnTy &MatchInfo) {
+  const GUnmerge *Unmerge = cast<GUnmerge>(&MI);
+
+  if (!MRI.hasOneNonDBGUse(Unmerge->getSourceReg()))
+    return false;
+
+  const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg());
+
+  LLT DstTy = MRI.getType(Unmerge->getReg(0));
+
+  // $bv:_(<8 x s8>) = G_BUILD_VECTOR ....
+  // $any:_(<8 x s16>) = G_ANYEXT $bv
+  // $uv:_(<4 x s16>), $uv1:_(<4 x s16>) = G_UNMERGE_VALUES $any
+  //
+  // ->
+  //
+  // $any:_(s16) = G_ANYEXT $bv[0]
+  // $any1:_(s16) = G_ANYEXT $bv[1]
+  // $any2:_(s16) = G_ANYEXT $bv[2]
+  // $any3:_(s16) = G_ANYEXT $bv[3]
+  // $any4:_(s16) = G_ANYEXT $bv[4]
+  // $any5:_(s16) = G_ANYEXT $bv[5]
+  // $any6:_(s16) = G_ANYEXT $bv[6]
+  // $any7:_(s16) = G_ANYEXT $bv[7]
+  // $uv:_(<4 x s16>) = G_BUILD_VECTOR $any, $any1, $any2, $any3
+  // $uv1:_(<4 x s16>) = G_BUILD_VECTOR $any4, $any5, $any6, $any7
+
+  // We want to unmerge into vectors.
+  if (!DstTy.isFixedVector())
+    return false;
+
+  const GAnyExt *Any = dyn_cast<GAnyExt>(Source);
+  if (!Any)
+    return false;
+
+  const MachineInstr *NextSource = MRI.getVRegDef(Any->getSrcReg());
+
+  if (const GBuildVector *BV = dyn_cast<GBuildVector>(NextSource)) {
+    // G_UNMERGE_VALUES G_ANYEXT G_BUILD_VECTOR
+
+    if (!MRI.hasOneNonDBGUse(BV->getReg(0)))
+      return false;
+
+    // FIXME: check element types?
+    if (BV->getNumSources() % Unmerge->getNumDefs() != 0)
+      return false;
+
+    LLT BigBvTy = MRI.getType(BV->getReg(0));
+    LLT SmallBvTy = DstTy;
+    LLT SmallBvElemenTy = SmallBvTy.getElementType();
+
+    if (!isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_BUILD_VECTOR, {SmallBvTy, SmallBvElemenTy}}))
+      return false;
+
+    // We check the legality of scalar anyext.
+    if (!isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_ANYEXT,
+             {SmallBvElemenTy, BigBvTy.getElementType()}}))
+      return false;
+
+    MatchInfo = [=](MachineIRBuilder &B) {
+      // Build into each G_UNMERGE_VALUES def
+      // a small build vector with anyext from the source build vector.
+      for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) {
+        SmallVector<Register> Ops;
+        for (unsigned J = 0; J < SmallBvTy.getNumElements(); ++J) {
+          Register SourceArray =
+              BV->getSourceReg(I * SmallBvTy.getNumElements() + J);
+          auto AnyExt = B.buildAnyExt(SmallBvElemenTy, SourceArray);
+          Ops.push_back(AnyExt.getReg(0));
+        }
+        B.buildBuildVector(Unmerge->getOperand(I).getReg(), Ops);
+      };
+    };
+    return true;
+  };
+
+  return false;
+}
+
+bool CombinerHelper::matchUnmergeValuesOfScalarAndVector(const MachineInstr &MI,
+                                                         BuildFnTy &MatchInfo) {
+
+  constexpr unsigned MAX_NUM_DEFS_LIMIT = 8;
+
+  //  %opaque:_(<2 x s64>) = G_OPAQUE
+  //  %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>)
+  //
+  //  ->
+  //
+  //  %zero:_(s64) = G_CONSTANT i64 0
+  //  %one:_(s64) = G_CONSTANT i64 1
+  //  %un1:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %zero
+  //  %un2:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %one
+
+  const GUnmerge *Unmerge = cast<GUnmerge>(&MI);
+
+  if (Unmerge->getNumDefs() > MAX_NUM_DEFS_LIMIT)
+    return false;
+
+  LLT DstTy = MRI.getType(Unmerge->getReg(0));
+  LLT SrcTy = MRI.getType(Unmerge->getSourceReg());
+
+  // We want to unmerge a vector into scalars.
+  if (!DstTy.isScalar() || !SrcTy.isFixedVector() || DstTy.getSizeInBits() > 64)
+    return false;
+
+  if (DstTy != SrcTy.getElementType())
+    return false;
+
+  // We want to unmerge from an opaque vector.
+  const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg());
+  if (isa<GBuildVector>(Source))
+    return false;
+
+  unsigned PreferredVecIdxWidth =
+      getTargetLowering().getVectorIdxTy(getDataLayout()).getSizeInBits();
+
+  LLT IdxTy = LLT::scalar(PreferredVecIdxWidth);
+
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_EXTRACT_VECTOR_ELT, {DstTy, SrcTy, IdxTy}}))
+    return false;
+
+  if (!isConstantLegalOrBeforeLegalizer(IdxTy))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) {
+      auto Index = B.buildConstant(IdxTy, I);
+      B.buildExtractVectorElement(Unmerge->getOperand(I).getReg(),
+                                  Unmerge->getSourceReg(), Index);
+    }
+  };
+
+  return true;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 8af8cdfeba6ac4..1eb7488e4ff570 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -322,7 +322,7 @@ def AArch64PostLegalizerCombiner
                         extractvecelt_pairwise_add, redundant_or,
                         mul_const, redundant_sext_inreg,
                         form_bitfield_extract, rotate_out_of_range,
-                        icmp_to_true_false_known_bits,
+                        icmp_to_true_false_known_bits, vector_ops_combines,
                         select_combines, fold_merge_to_zext,
                         constant_fold_binops, identity_combines,
                         ptr_add_immed_chain, overlapping_and,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 7566d38e6c6cfa..fc7584a2e1b162 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -422,9 +422,12 @@ body:             |
     ; CHECK-LABEL: name: test_dont_combine_unmerge_zext_to_zext_src_vector
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $w0
     ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<2 x s32>) = G_ZEXT [[COPY]](<2 x s16>)
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](<2 x s32>)
-    ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
-    ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ZEXT]](<2 x s32>), [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ZEXT]](<2 x s32>), [[C1]](s64)
+    ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32)
+    ; CHECK-NEXT: $w1 = COPY [[EVEC1]](s32)
     %0:_(<2 x s16>) = COPY $w0
     %3:_(<2 x s32>) = G_ZEXT %0(<2 x s16>)
     %1:_(s32),%2:_(s32) = G_UNMERGE_VALUES %3(<2 x s32>)
@@ -539,3 +542,98 @@ body:             |
     $q0 = COPY %un1(s128)
     $q1 = COPY %un2(s128)
 ...
+
+# Check that we unmerge the opaque vector into extract vector elt
+---
+name:            test_opaque_vector_scalar
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_opaque_vector_scalar
+    ; CHECK: %opaque:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %un1:_(s64) = G_EXTRACT_VECTOR_ELT %opaque(<2 x s64>), [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: %un2:_(s64) = G_EXTRACT_VECTOR_ELT %opaque(<2 x s64>), [[C1]](s64)
+    ; CHECK-NEXT: $x0 = COPY %un1(s64)
+    ; CHECK-NEXT: $x1 = COPY %un2(s64)
+    %opaque:_(<2 x s64>) = COPY $q0
+    %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>)
+    $x0 = COPY %un1(s64)
+    $x1 = COPY %un2(s64)
+...
+
+# Check that we don't combine the unmerge when the source is an opaque scalar rather than a vector
+---
+name:            test_opaque_vector_vector
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_opaque_vector_vector
+    ; CHECK: %opaque:_(s128) = COPY $q0
+    ; CHECK-NEXT: %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(s128)
+    ; CHECK-NEXT: $x0 = COPY %un1(s64)
+    ; CHECK-NEXT: $x1 = COPY %un2(s64)
+    %opaque:_(s128) = COPY $q0
+    %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(s128)
+    $x0 = COPY %un1(s64)
+    $x1 = COPY %un2(s64)
+...
+
+# Check that we unmerge the long opaque vector into extract vector elt
+---
+name:            test_long_opaque_vector_scalar
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_long_opaque_vector_scalar
+    ; CHECK: %opaque:_(<8 x s16>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %un1:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: %un2:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: %un3:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: %un4:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C3]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: %un5:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C4]](s64)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: %un6:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C5]](s64)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: %un7:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C6]](s64)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: %un8:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C7]](s64)
+    ; CHECK-NEXT: %zext1:_(s32) = G_ZEXT %un1(s16)
+    ; CHECK-NEXT: %zext2:_(s32) = G_ZEXT %un2(s16)
+    ; CHECK-NEXT: %zext3:_(s32) = G_ZEXT %un3(s16)
+    ; CHECK-NEXT: %zext4:_(s32) = G_ZEXT %un4(s16)
+    ; CHECK-NEXT: %zext5:_(s32) = G_ZEXT %un5(s16)
+    ; CHECK-NEXT: %zext6:_(s32) = G_ZEXT %un6(s16)
+    ; CHECK-NEXT: %zext7:_(s32) = G_ZEXT %un7(s16)
+    ; CHECK-NEXT: %zext8:_(s32) = G_ZEXT %un8(s16)
+    ; CHECK-NEXT: $w0 = COPY %zext1(s32)
+    ; CHECK-NEXT: $w1 = COPY %zext2(s32)
+    ; CHECK-NEXT: $w0 = COPY %zext3(s32)
+    ; CHECK-NEXT: $w1 = COPY %zext4(s32)
+    ; CHECK-NEXT: $w0 = COPY %zext5(s32)
+    ; CHECK-NEXT: $w1 = COPY %zext6(s32)
+    ; CHECK-NEXT: $w0 = COPY %zext7(s32)
+    ; CHECK-NEXT: $w1 = COPY %zext8(s32)
+    %opaque:_(<8 x s16>) = COPY $q0
+    %un1:_(s16), %un2:_(s16), %un3:_(s16), %un4:_(s16), %un5:_(s16), %un6:_(s16), %un7:_(s16), %un8:_(s16) = G_UNMERGE_VALUES %opaque(<8 x s16>)
+    %zext1:_(s32) = G_ZEXT %un1
+    %zext2:_(s32) = G_ZEXT %un2
+    %zext3:_(s32) = G_ZEXT %un3
+    %zext4:_(s32) = G_ZEXT %un4
+    %zext5:_(s32) = G_ZEXT %un5
+    %zext6:_(s32) = G_ZEXT %un6
+    %zext7:_(s32) = G_ZEXT %un7
+    %zext8:_(s32) = G_ZEXT %un8
+    $w0 = COPY %zext1(s32)
+    $w1 = COPY %zext2(s32)
+    $w0 = COPY %zext3(s32)
+    $w1 = COPY %zext4(s32)
+    $w0 = COPY %zext5(s32)
+    $w1 = COPY %zext6(s32)
+    $w0 = COPY %zext7(s32)
+    $w1 = COPY %zext8(s32)
+...
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
index f7aa57a068a4ce..4d75367fa06b49 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -590,14 +590,26 @@ entry:
 }
 
 define i16 @sminv_v3i16(<3 x i16> %a) {
-; CHECK-LABEL: sminv_v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    mov v0.h[3], w8
-; CHECK-NEXT:    sminv h0, v0.4h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sminv_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    sminv h0, v0.4h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sminv_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], w8
+; CHECK-GI-NEXT:    sminv h0, v1.4h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a)
   ret i16 %arg1
@@ -649,13 +661,24 @@ entry:
 }
 
 define i32 @sminv_v3i32(<3 x i32> %a) {
-; CHECK-LABEL: sminv_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
-; CHECK-NEXT:    mov v0.s[3], w8
-; CHECK-NEXT:    sminv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sminv_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #2147483647 // =0x7fffffff
+; CHECK-SD-NEXT:    mov v0.s[3], w8
+; CHECK-SD-NEXT:    sminv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sminv_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov w8, #2147483647 // =0x7fffffff
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    sminv s0, v1.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = call i32 @llvm.vector.reduce.smin.v3i32(<3 x i32> %a)
   ret i32 %arg1
@@ -954,9 +977,12 @@ define i16 @smaxv_v3i16(<3 x i16> %a) {
 ; CHECK-GI-LABEL: smaxv_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    smaxv h0, v0.4h
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], w8
+; CHECK-GI-NEXT:    smaxv h0, v1.4h
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1010,13 +1036,24 @@ entry:
 }
 
 define i32 @smaxv_v3i32(<3 x i32> %a) {
-; CHECK-LABEL: smaxv_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
-; CHECK-NEXT:    mov v0.s[3], w8
-; CHECK-NEXT:    smaxv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: smaxv_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-SD-NEXT:    mov v0.s[3], w8
+; CHECK-SD-NEXT:    smaxv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: smaxv_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    smaxv s0, v1.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> %a)
   ret i32 %arg1
@@ -1313,9 +1350,12 @@ define i16 @uminv_v3i16(<3 x i16> %a) {
 ; CHECK-GI-LABEL: uminv_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    uminv h0, v0.4h
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], w8
+; CHECK-GI-NEXT:    uminv h0, v1.4h
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1369,13 +1409,24 @@ entry:
 }
 
 define i32 @uminv_v3i32(<3 x i32> %a) {
-; CHECK-LABEL: uminv_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    mov v0.s[3], w8
-; CHECK-NEXT:    uminv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uminv_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-SD-NEXT:    mov v0.s[3], w8
+; CHECK-SD-NEXT:    uminv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uminv_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    uminv s0, v1.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = call i32 @llvm.vector.reduce.umin.v3i32(<3 x i32> %a)
   ret i32 %arg1
@@ -1671,9 +1722,12 @@ define i16 @umaxv_v3i16(<3 x i16> %a) {
 ; CHECK-GI-LABEL: umaxv_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    mov w8, #0 // =0x0
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    umaxv h0, v0.4h
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], w8
+; CHECK-GI-NEXT:    umaxv h0, v1.4h
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1727,12 +1781,22 @@ entry:
 }
 
 define i32 @umaxv_v3i32(<3 x i32> %a) {
-; CHECK-LABEL: umaxv_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov v0.s[3], wzr
-; CHECK-NEXT:    umaxv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: umaxv_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov v0.s[3], wzr
+; CHECK-SD-NEXT:    umaxv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: umaxv_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[3], wzr
+; CHECK-GI-NEXT:    umaxv s0, v1.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a)
   ret i32 %arg1
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 25a14ef9a49ee8..b5794007bdddb0 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -336,9 +336,17 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){
 ; CHECK-GI-NEXT:    mov v0.b[1], w1
 ; CHECK-GI-NEXT:    mov v0.b[2], w2
 ; CHECK-GI-NEXT:    abs v0.8b, v0.8b
-; CHECK-GI-NEXT:    umov w0, v0.b[0]
-; CHECK-GI-NEXT:    umov w1, v0.b[1]
-; CHECK-GI-NEXT:    umov w2, v0.b[2]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[1]
+; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v0.b[2]
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov s0, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v1.s[2]
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    fmov w1, s0
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
 entry:
   %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %a, i1 0)
@@ -347,10 +355,66 @@ entry:
 declare <3 x i8> @llvm.abs.v3i8(<3 x i8>, i1)
 
 define <7 x i8> @abs_v7i8(<7 x i8> %a){
-; CHECK-LABEL: abs_v7i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    abs v0.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: abs_v7i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    abs v0.8b, v0.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: abs_v7i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov b1, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[4]
+; CHECK-GI-NEXT:    mov v2.b[3], v1.b[0]
+; CHECK-GI-NEXT:    mov b1, v0.b[5]
+; CHECK-GI-NEXT:    mov b0, v0.b[6]
+; CHECK-GI-NEXT:    mov v2.b[4], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[5], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[6], v0.b[0]
+; CHECK-GI-NEXT:    abs v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov b5, v0.b[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov b1, v0.b[5]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.h[4], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.h[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.h[6], w8
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[3]
+; CHECK-GI-NEXT:    mov h3, v0.h[4]
+; CHECK-GI-NEXT:    mov h4, v0.h[5]
+; CHECK-GI-NEXT:    mov h5, v0.h[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov h1, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.b[3], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.b[4], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.b[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.b[6], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %res = call <7 x i8> @llvm.abs.v7i8(<7 x i8> %a, i1 0)
   ret <7 x i8> %res
@@ -358,10 +422,30 @@ entry:
 declare <7 x i8> @llvm.abs.v7i8(<7 x i8>, i1)
 
 define <3 x i16> @abs_v3i16(<3 x i16> %a){
-; CHECK-LABEL: abs_v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    abs v0.4h, v0.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: abs_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    abs v0.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: abs_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    abs v1.4h, v1.4h
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %a, i1 0)
   ret <3 x i16> %res
@@ -369,10 +453,29 @@ entry:
 declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
 
 define <7 x i16> @abs_v7i16(<7 x i16> %a){
-; CHECK-LABEL: abs_v7i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    abs v0.8h, v0.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: abs_v7i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    abs v0.8h, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: abs_v7i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-NEXT:    abs v1.8h, v1.8h
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    ret
 entry:
   %res = call <7 x i16> @llvm.abs.v7i16(<7 x i16> %a, i1 0)
   ret <7 x i16> %res
@@ -380,10 +483,21 @@ entry:
 declare <7 x i16> @llvm.abs.v7i16(<7 x i16>, i1)
 
 define <3 x i32> @abs_v3i32(<3 x i32> %a){
-; CHECK-LABEL: abs_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    abs v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: abs_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    abs v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: abs_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    abs v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %res = call <3 x i32> @llvm.abs.v3i32(<3 x i32> %a, i1 0)
   ret <3 x i32> %res
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index e3072dc41d933c..5d11deaac40bee 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -343,10 +343,24 @@ entry:
 }
 
 define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = add <3 x i32> %d, %e
   ret <3 x i32> %s
@@ -408,8 +422,9 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-GI-NEXT:    add x8, x8, x9
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 5c7429aebb31e9..70477b0c98c77a 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -1050,30 +1050,72 @@ entry:
 }
 
 define <3 x i32> @and_v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: and_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    and v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = and <3 x i32> %d, %e
   ret <3 x i32> %s
 }
 
 define <3 x i32> @or_v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: or_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    orr v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = or <3 x i32> %d, %e
   ret <3 x i32> %s
 }
 
 define <3 x i32> @xor_v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: xor_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: xor_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xor_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = xor <3 x i32> %d, %e
   ret <3 x i32> %s
@@ -1209,8 +1251,9 @@ define <3 x i64> @and_v3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-GI-NEXT:    and x8, x8, x9
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -1238,8 +1281,9 @@ define <3 x i64> @or_v3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-GI-NEXT:    orr x8, x8, x9
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -1267,8 +1311,9 @@ define <3 x i64> @xor_v3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-GI-NEXT:    eor x8, x8, x9
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    eor v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
index 7eb26096ed1566..8ec8ba877d854e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
@@ -1216,9 +1216,12 @@ define <2 x half> @vec_round_f16(<2 x fp128> %val) {
 ; CHECK-GI-NEXT:    bl __trunctfhf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    bl __trunctfhf2
-; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT:    mov h0, v1.h[1]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 39f2572d9fd354..8449b69a473d92 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -634,10 +634,27 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){
 ; ===== Vectors with Non-Pow 2 Widths =====
 
 define <6 x i16> @bitcast_v3i32_v6i16(<3 x i32> %a, <3 x i32> %b){
-; CHECK-LABEL: bitcast_v3i32_v6i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_v3i32_v6i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_v3i32_v6i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    ret
   %c = add <3 x i32> %a, %b
   %d = bitcast <3 x i32> %c to <6 x i16>
   ret <6 x i16> %d
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 74e4a167ae14ca..9f9653fcbb50b5 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -246,10 +246,30 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
 ; ===== Vectors with Non-Pow 2 Widths =====
 
 define <3 x i16> @bswap_v3i16(<3 x i16> %a){
-; CHECK-LABEL: bswap_v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev16 v0.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bswap_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bswap_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    rev16 v1.8b, v1.8b
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %res = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %a)
   ret <3 x i16> %res
@@ -257,10 +277,29 @@ entry:
 declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>)
 
 define <7 x i16> @bswap_v7i16(<7 x i16> %a){
-; CHECK-LABEL: bswap_v7i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bswap_v7i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bswap_v7i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-NEXT:    rev16 v1.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    ret
 entry:
   %res = call <7 x i16> @llvm.bswap.v7i16(<7 x i16> %a)
   ret <7 x i16> %res
@@ -268,10 +307,21 @@ entry:
 declare <7 x i16> @llvm.bswap.v7i16(<7 x i16>)
 
 define <3 x i32> @bswap_v3i32(<3 x i32> %a){
-; CHECK-LABEL: bswap_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v0.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bswap_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bswap_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    rev32 v1.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %res = call <3 x i32> @llvm.bswap.v3i32(<3 x i32> %a)
   ret <3 x i32> %res
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index 43e90070736345..1aed6cb8bf9ed8 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -88,6 +88,7 @@ define <3 x double> @fabs_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fabs d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fabs v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -119,10 +120,21 @@ entry:
 }
 
 define <3 x float> @fabs_v3f32(<3 x float> %a) {
-; CHECK-LABEL: fabs_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fabs v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fabs_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fabs v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fabs_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fabs v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -162,13 +174,41 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mvni v1.8h, #128, lsl #8
-; CHECK-GI-NOFP16-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mvni v0.8h, #128, lsl #8
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v1.16b, v0.16b
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fabs v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    fabs v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.fabs.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index b15579199a0598..4227c891d844f4 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -93,6 +93,7 @@ define <3 x double> @fadd_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    fadd d2, d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fadd v0.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -130,10 +131,24 @@ entry:
 }
 
 define <3 x float> @fadd_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: fadd_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fadd_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fadd_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    fadd v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fadd <3 x float> %a, %b
   ret <3 x float> %c
@@ -186,32 +201,68 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fadd_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fadd v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fadd_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fadd v1.8h, v2.8h, v3.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fadd <7 x half> %a, %b
@@ -434,6 +485,7 @@ define <3 x double> @fsub_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    fsub d2, d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fsub v0.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -471,10 +523,24 @@ entry:
 }
 
 define <3 x float> @fsub_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: fsub_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fsub_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fsub_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    fsub v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fsub <3 x float> %a, %b
   ret <3 x float> %c
@@ -527,32 +593,68 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fsub_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fsub v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fsub v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fsub v0.4s, v0.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fsub_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fsub v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fsub v1.8h, v2.8h, v3.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fsub <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 66f26fc9d85973..584ffa92493d08 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -783,7 +783,8 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    orr x8, x9, x8
 ; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #176
@@ -856,8 +857,9 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double>
 ; CHECK-GI-NEXT:    and x8, x8, x9
 ; CHECK-GI-NEXT:    bic x9, x10, x9
 ; CHECK-GI-NEXT:    orr x8, x8, x9
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -930,24 +932,33 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    mov v4.s[0], v7.s[0]
 ; CHECK-GI-NEXT:    cset w9, mi
 ; CHECK-GI-NEXT:    mov v2.s[0], w9
 ; CHECK-GI-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-GI-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
 ; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    mov v3.s[0], w9
+; CHECK-GI-NEXT:    mov v4.s[1], v7.s[1]
 ; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
 ; CHECK-GI-NEXT:    mov v3.s[1], w9
+; CHECK-GI-NEXT:    mov v4.s[2], v7.s[2]
 ; CHECK-GI-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-GI-NEXT:    mov v2.s[0], v6.s[0]
 ; CHECK-GI-NEXT:    mov v3.s[2], w9
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    neg v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v2.s[1], v6.s[1]
 ; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v2.s[2], v6.s[2]
 ; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v3.16b
-; CHECK-GI-NEXT:    and v0.16b, v6.16b, v0.16b
-; CHECK-GI-NEXT:    and v1.16b, v7.16b, v1.16b
-; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    and v1.16b, v4.16b, v1.16b
+; CHECK-GI-NEXT:    orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x double> %a, %b
@@ -1000,22 +1011,37 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d,
 ;
 ; CHECK-GI-LABEL: v3f32_float:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v4.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    mov v6.s[0], w8
 ; CHECK-GI-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    mov v4.s[0], w8
-; CHECK-GI-NEXT:    mov v5.s[0], w9
-; CHECK-GI-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NEXT:    mov v5.s[1], w9
-; CHECK-GI-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w9
-; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
-; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v4.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v6.s[1], w8
+; CHECK-GI-NEXT:    mov v4.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v0.s[0], w9
+; CHECK-GI-NEXT:    mov v6.s[2], w8
+; CHECK-GI-NEXT:    fcmgt v1.4s, v5.4s, v4.4s
+; CHECK-GI-NEXT:    mov v4.s[0], v2.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v3.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT:    neg v6.4s, v6.4s
+; CHECK-GI-NEXT:    mov v4.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v3.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], w9
+; CHECK-GI-NEXT:    sshl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT:    mov v4.s[2], v2.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v3.s[2]
+; CHECK-GI-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    and v1.16b, v4.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT:    orr v1.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x float> %a, %b
@@ -1078,22 +1104,37 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i
 ;
 ; CHECK-GI-LABEL: v3f32_i32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v4.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    mov v6.s[0], w8
 ; CHECK-GI-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    mov v4.s[0], w8
-; CHECK-GI-NEXT:    mov v5.s[0], w9
-; CHECK-GI-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NEXT:    mov v5.s[1], w9
-; CHECK-GI-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w9
-; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
-; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v4.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v6.s[1], w8
+; CHECK-GI-NEXT:    mov v4.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v0.s[0], w9
+; CHECK-GI-NEXT:    mov v6.s[2], w8
+; CHECK-GI-NEXT:    fcmgt v1.4s, v5.4s, v4.4s
+; CHECK-GI-NEXT:    mov v4.s[0], v2.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v3.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT:    neg v6.4s, v6.4s
+; CHECK-GI-NEXT:    mov v4.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v3.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], w9
+; CHECK-GI-NEXT:    sshl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT:    mov v4.s[2], v2.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v3.s[2]
+; CHECK-GI-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    and v1.16b, v4.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT:    orr v1.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x float> %a, %b
@@ -1204,70 +1245,134 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ;
 ; CHECK-GI-NOFP16-LABEL: v7f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #15 // =0xf
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
+; CHECK-GI-NOFP16-NEXT:    fmov s6, w8
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov w9, #65535 // =0xffff
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[0], v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fmov s7, w9
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v19.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], w9
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v19.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], w9
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], w8
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v19.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[3], w8
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v17.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[3], w9
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[4], w8
-; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v6.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v19.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v16.4h
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[4], w8
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[4], w9
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[5], w8
-; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[4], v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v19.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v4.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v5.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[5], w8
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[5], w9
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[6], w8
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[5], v2.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v19.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[6], w8
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[6], w9
-; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
-; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v5.8h
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[6], v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v19.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v6.8h
+; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v6.8h
 ; CHECK-GI-NOFP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v0.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    and v0.16b, v18.16b, v0.16b
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v19.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_half:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT:    mov v4.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[0], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    mov w8, #15 // =0xf
+; CHECK-GI-FP16-NEXT:    fmov s6, w8
 ; CHECK-GI-FP16-NEXT:    mov w9, #65535 // =0xffff
-; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    fmov s4, w8
-; CHECK-GI-FP16-NEXT:    fmov s5, w9
-; CHECK-GI-FP16-NEXT:    mov v4.h[1], w8
-; CHECK-GI-FP16-NEXT:    mov v5.h[1], w9
-; CHECK-GI-FP16-NEXT:    mov v4.h[2], w8
-; CHECK-GI-FP16-NEXT:    mov v5.h[2], w9
-; CHECK-GI-FP16-NEXT:    mov v4.h[3], w8
-; CHECK-GI-FP16-NEXT:    mov v5.h[3], w9
-; CHECK-GI-FP16-NEXT:    mov v4.h[4], w8
-; CHECK-GI-FP16-NEXT:    mov v5.h[4], w9
-; CHECK-GI-FP16-NEXT:    mov v4.h[5], w8
-; CHECK-GI-FP16-NEXT:    mov v5.h[5], w9
-; CHECK-GI-FP16-NEXT:    mov v4.h[6], w8
-; CHECK-GI-FP16-NEXT:    mov v5.h[6], w9
-; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v4.8h
-; CHECK-GI-FP16-NEXT:    neg v1.8h, v4.8h
+; CHECK-GI-FP16-NEXT:    mov v16.h[0], v2.h[0]
+; CHECK-GI-FP16-NEXT:    fmov s7, w9
+; CHECK-GI-FP16-NEXT:    mov v17.h[0], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov v4.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v6.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v7.h[1], w9
+; CHECK-GI-FP16-NEXT:    mov v16.h[1], v2.h[1]
+; CHECK-GI-FP16-NEXT:    mov v17.h[1], v3.h[1]
+; CHECK-GI-FP16-NEXT:    mov v4.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v6.h[2], w8
+; CHECK-GI-FP16-NEXT:    mov v7.h[2], w9
+; CHECK-GI-FP16-NEXT:    mov v16.h[2], v2.h[2]
+; CHECK-GI-FP16-NEXT:    mov v17.h[2], v3.h[2]
+; CHECK-GI-FP16-NEXT:    mov v4.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v6.h[3], w8
+; CHECK-GI-FP16-NEXT:    mov v7.h[3], w9
+; CHECK-GI-FP16-NEXT:    mov v16.h[3], v2.h[3]
+; CHECK-GI-FP16-NEXT:    mov v17.h[3], v3.h[3]
+; CHECK-GI-FP16-NEXT:    mov v4.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v6.h[4], w8
+; CHECK-GI-FP16-NEXT:    mov v7.h[4], w9
+; CHECK-GI-FP16-NEXT:    mov v16.h[4], v2.h[4]
+; CHECK-GI-FP16-NEXT:    mov v17.h[4], v3.h[4]
+; CHECK-GI-FP16-NEXT:    mov v4.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v6.h[5], w8
+; CHECK-GI-FP16-NEXT:    mov v7.h[5], w9
+; CHECK-GI-FP16-NEXT:    mov v16.h[5], v2.h[5]
+; CHECK-GI-FP16-NEXT:    mov v17.h[5], v3.h[5]
+; CHECK-GI-FP16-NEXT:    mov v4.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    mov v6.h[6], w8
+; CHECK-GI-FP16-NEXT:    mov v7.h[6], w9
+; CHECK-GI-FP16-NEXT:    mov v16.h[6], v2.h[6]
+; CHECK-GI-FP16-NEXT:    mov v17.h[6], v3.h[6]
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v5.8h, v4.8h
+; CHECK-GI-FP16-NEXT:    neg v1.8h, v6.8h
+; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v6.8h
 ; CHECK-GI-FP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v5.16b
-; CHECK-GI-FP16-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-GI-FP16-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v7.16b
+; CHECK-GI-FP16-NEXT:    and v0.16b, v16.16b, v0.16b
+; CHECK-GI-FP16-NEXT:    and v1.16b, v17.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt <7 x half> %a, %b
@@ -1690,61 +1795,69 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-NOFP16-LABEL: v7f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[0], w0
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[0], w9
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[0], w7
-; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp]
-; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #24]
-; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp, #32]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w1
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[0], w9
+; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp]
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[0], w0
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[0], w7
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], w1
 ; CHECK-GI-NOFP16-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[1], w9
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #8]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w2
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[2], w9
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], v16.s[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #40]
+; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[2], w9
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], w2
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], w4
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[3], w3
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v5.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp, #24]
+; CHECK-GI-NOFP16-NEXT:    ldr s4, [sp, #8]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], w5
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[2], v16.s[0]
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[3], w3
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], v18.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[2], v4.s[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s4, [sp, #40]
+; CHECK-GI-NOFP16-NEXT:    ushl v0.4s, v0.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT:    neg v6.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], w6
 ; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[0], w4
-; CHECK-GI-NOFP16-NEXT:    ushl v2.4s, v2.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    neg v4.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w5
-; CHECK-GI-NOFP16-NEXT:    sshl v2.4s, v2.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    ldr s4, [sp, #16]
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w6
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[3], v4.s[0]
-; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v2.16b, v6.16b
-; CHECK-GI-NOFP16-NEXT:    and v2.16b, v3.16b, v2.16b
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v17.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v5.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v2.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NOFP16-NEXT:    mov s4, v0.s[3]
-; CHECK-GI-NOFP16-NEXT:    fmov w0, s0
-; CHECK-GI-NOFP16-NEXT:    mov s5, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s6, v1.s[2]
-; CHECK-GI-NOFP16-NEXT:    fmov w4, s1
-; CHECK-GI-NOFP16-NEXT:    fmov w1, s2
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], v4.s[0]
+; CHECK-GI-NOFP16-NEXT:    sshl v0.4s, v0.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[3], v6.s[0]
+; CHECK-GI-NOFP16-NEXT:    eor v3.16b, v0.16b, v16.16b
+; CHECK-GI-NOFP16-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v5.16b, v3.16b
+; CHECK-GI-NOFP16-NEXT:    bsl v2.16b, v7.16b, v17.16b
+; CHECK-GI-NOFP16-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    mov s1, v2.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s3, v2.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov s4, v2.s[3]
+; CHECK-GI-NOFP16-NEXT:    fmov w0, s2
+; CHECK-GI-NOFP16-NEXT:    mov s5, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s6, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    fmov w4, s0
+; CHECK-GI-NOFP16-NEXT:    fmov w1, s1
 ; CHECK-GI-NOFP16-NEXT:    fmov w2, s3
 ; CHECK-GI-NOFP16-NEXT:    fmov w3, s4
 ; CHECK-GI-NOFP16-NEXT:    fmov w5, s5
@@ -1753,37 +1866,51 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    mov w9, #31 // =0x1f
 ; CHECK-GI-FP16-NEXT:    mov v4.s[0], w0
-; CHECK-GI-FP16-NEXT:    mov v2.s[0], w9
 ; CHECK-GI-FP16-NEXT:    mov v5.s[0], w7
 ; CHECK-GI-FP16-NEXT:    ldr s6, [sp]
 ; CHECK-GI-FP16-NEXT:    mov v7.s[0], w4
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
 ; CHECK-GI-FP16-NEXT:    ldr s17, [sp, #8]
-; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
-; CHECK-GI-FP16-NEXT:    umov w10, v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v4.s[1], w1
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-FP16-NEXT:    mov v5.s[1], v6.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s6, [sp, #24]
 ; CHECK-GI-FP16-NEXT:    mov v7.s[1], w5
 ; CHECK-GI-FP16-NEXT:    mov v6.s[1], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
-; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
-; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w9
+; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
 ; CHECK-GI-FP16-NEXT:    mov v4.s[2], w2
 ; CHECK-GI-FP16-NEXT:    mov v5.s[2], v17.s[0]
 ; CHECK-GI-FP16-NEXT:    mov v7.s[2], w6
-; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v6.s[2], v16.s[0]
+; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v4.s[3], w3
+; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v3.8h, v2.8h
+; CHECK-GI-FP16-NEXT:    mov v2.s[0], w9
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT:    umov w10, v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w9
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
 ; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v3.s[0], w10
-; CHECK-GI-FP16-NEXT:    mov v4.s[3], w3
 ; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
 ; CHECK-GI-FP16-NEXT:    mov v3.s[1], w10
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index a42ec8e253be29..7f07b088182cae 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -111,7 +111,8 @@ define <3 x double> @copysign_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    and x9, x9, #0x8000000000000000
 ; CHECK-GI-NEXT:    fneg v1.2d, v6.2d
 ; CHECK-GI-NEXT:    orr x8, x8, x9
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -156,15 +157,24 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov w8, #-2147483648 // =0x80000000
 ; CHECK-GI-NEXT:    mov w9, #2147483647 // =0x7fffffff
-; CHECK-GI-NEXT:    mov v2.s[0], w9
-; CHECK-GI-NEXT:    mov v3.s[0], w8
-; CHECK-GI-NEXT:    mov v2.s[1], w9
-; CHECK-GI-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NEXT:    mov v2.s[2], w9
-; CHECK-GI-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], w9
+; CHECK-GI-NEXT:    mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], w8
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], w9
+; CHECK-GI-NEXT:    mov v4.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], w8
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v4.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], w9
+; CHECK-GI-NEXT:    mov v5.s[2], w8
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    and v1.16b, v4.16b, v5.16b
+; CHECK-GI-NEXT:    orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.copysign.v3f32(<3 x float> %a, <3 x float> %b)
@@ -203,25 +213,46 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-LABEL: copysign_v7f16:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-GI-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-GI-NEXT:    mov w9, #32767 // =0x7fff
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    fmov s3, w8
-; CHECK-GI-NEXT:    mov v2.h[1], w9
-; CHECK-GI-NEXT:    mov v3.h[1], w8
-; CHECK-GI-NEXT:    mov v2.h[2], w9
-; CHECK-GI-NEXT:    mov v3.h[2], w8
-; CHECK-GI-NEXT:    mov v2.h[3], w9
-; CHECK-GI-NEXT:    mov v3.h[3], w8
-; CHECK-GI-NEXT:    mov v2.h[4], w9
-; CHECK-GI-NEXT:    mov v3.h[4], w8
-; CHECK-GI-NEXT:    mov v2.h[5], w9
-; CHECK-GI-NEXT:    mov v3.h[5], w8
-; CHECK-GI-NEXT:    mov v2.h[6], w9
-; CHECK-GI-NEXT:    mov v3.h[6], w8
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v5.h[1], w8
+; CHECK-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v4.h[1], w9
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v5.h[2], w8
+; CHECK-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v4.h[2], w9
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v5.h[3], w8
+; CHECK-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v4.h[3], w9
+; CHECK-GI-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-NEXT:    mov v5.h[4], w8
+; CHECK-GI-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v4.h[4], w9
+; CHECK-GI-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-NEXT:    mov v5.h[5], w8
+; CHECK-GI-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v4.h[5], w9
+; CHECK-GI-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v5.h[6], w8
+; CHECK-GI-NEXT:    mov v4.h[6], w9
+; CHECK-GI-NEXT:    and v1.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT:    orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.copysign.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index b408e9c1bd4e60..55d9984c6392f5 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -84,6 +84,7 @@ define <3 x double> @ceil_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    frintp d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    frintp v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -115,10 +116,21 @@ entry:
 }
 
 define <3 x float> @ceil_v3f32(<3 x float> %a) {
-; CHECK-LABEL: ceil_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    frintp v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ceil_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    frintp v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ceil_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    frintp v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.ceil.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -163,27 +175,52 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: ceil_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    frintp v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    frintp v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: ceil_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    frintp v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    frintp v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.ceil.v7f16(<7 x half> %a)
@@ -383,6 +420,7 @@ define <3 x double> @floor_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    frintm d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    frintm v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -414,10 +452,21 @@ entry:
 }
 
 define <3 x float> @floor_v3f32(<3 x float> %a) {
-; CHECK-LABEL: floor_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    frintm v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: floor_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    frintm v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: floor_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    frintm v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.floor.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -462,27 +511,52 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: floor_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    frintm v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    frintm v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: floor_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    frintm v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    frintm v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.floor.v7f16(<7 x half> %a)
@@ -682,6 +756,7 @@ define <3 x double> @nearbyint_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    frinti d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    frinti v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -713,10 +788,21 @@ entry:
 }
 
 define <3 x float> @nearbyint_v3f32(<3 x float> %a) {
-; CHECK-LABEL: nearbyint_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    frinti v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: nearbyint_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    frinti v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: nearbyint_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    frinti v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.nearbyint.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -761,27 +847,52 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    frinti v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    frinti v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: nearbyint_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    frinti v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    frinti v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.nearbyint.v7f16(<7 x half> %a)
@@ -981,6 +1092,7 @@ define <3 x double> @roundeven_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    frintn d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    frintn v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -1012,10 +1124,21 @@ entry:
 }
 
 define <3 x float> @roundeven_v3f32(<3 x float> %a) {
-; CHECK-LABEL: roundeven_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    frintn v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: roundeven_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    frintn v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: roundeven_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    frintn v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -1060,27 +1183,52 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: roundeven_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    frintn v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    frintn v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: roundeven_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    frintn v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    frintn v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.roundeven.v7f16(<7 x half> %a)
@@ -1280,6 +1428,7 @@ define <3 x double> @rint_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    frintx d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    frintx v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -1311,10 +1460,21 @@ entry:
 }
 
 define <3 x float> @rint_v3f32(<3 x float> %a) {
-; CHECK-LABEL: rint_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    frintx v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rint_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    frintx v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rint_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    frintx v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.rint.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -1359,27 +1519,52 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: rint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    frintx v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    frintx v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: rint_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    frintx v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    frintx v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.rint.v7f16(<7 x half> %a)
@@ -1579,6 +1764,7 @@ define <3 x double> @round_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    frinta d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    frinta v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -1610,10 +1796,21 @@ entry:
 }
 
 define <3 x float> @round_v3f32(<3 x float> %a) {
-; CHECK-LABEL: round_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    frinta v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: round_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    frinta v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: round_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    frinta v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.round.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -1658,27 +1855,52 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: round_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    frinta v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    frinta v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: round_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    frinta v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    frinta v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.round.v7f16(<7 x half> %a)
@@ -1878,6 +2100,7 @@ define <3 x double> @trunc_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    frintz d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    frintz v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -1909,10 +2132,21 @@ entry:
 }
 
 define <3 x float> @trunc_v3f32(<3 x float> %a) {
-; CHECK-LABEL: trunc_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    frintz v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: trunc_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    frintz v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: trunc_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    frintz v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.trunc.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -1957,27 +2191,52 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: trunc_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    frintz v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    frintz v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: trunc_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    frintz v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    frintz v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.trunc.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index 5bdccccc62b99c..9acd0166fcaa85 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -93,6 +93,7 @@ define <3 x double> @fdiv_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    fdiv d2, d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fdiv v0.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -130,10 +131,24 @@ entry:
 }
 
 define <3 x float> @fdiv_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: fdiv_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fdiv_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fdiv_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    fdiv v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fdiv <3 x float> %a, %b
   ret <3 x float> %c
@@ -186,32 +201,68 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fdiv_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fdiv v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fdiv_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fdiv v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fdiv v1.8h, v2.8h, v3.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fdiv <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index f13e2fcd1c4483..6072a2c56a06d1 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -139,29 +139,33 @@ define <3 x double> @exp_v3f64(<3 x double> %a) {
 ;
 ; CHECK-GI-LABEL: exp_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    bl exp
-; CHECK-GI-NEXT:    fmov d10, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl exp
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl exp
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    fmov d0, d10
-; CHECK-GI-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.exp.v3f64(<3 x double> %a)
@@ -355,7 +359,9 @@ define <3 x float> @exp_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -726,7 +732,13 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1442,29 +1454,33 @@ define <3 x double> @exp2_v3f64(<3 x double> %a) {
 ;
 ; CHECK-GI-LABEL: exp2_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    bl exp2
-; CHECK-GI-NEXT:    fmov d10, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl exp2
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl exp2
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    fmov d0, d10
-; CHECK-GI-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.exp2.v3f64(<3 x double> %a)
@@ -1658,7 +1674,9 @@ define <3 x float> @exp2_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2029,7 +2047,13 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2745,29 +2769,33 @@ define <3 x double> @log_v3f64(<3 x double> %a) {
 ;
 ; CHECK-GI-LABEL: log_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    bl log
-; CHECK-GI-NEXT:    fmov d10, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl log
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl log
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    fmov d0, d10
-; CHECK-GI-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.log.v3f64(<3 x double> %a)
@@ -2961,7 +2989,9 @@ define <3 x float> @log_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3332,7 +3362,13 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4048,29 +4084,33 @@ define <3 x double> @log2_v3f64(<3 x double> %a) {
 ;
 ; CHECK-GI-LABEL: log2_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    bl log2
-; CHECK-GI-NEXT:    fmov d10, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl log2
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl log2
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    fmov d0, d10
-; CHECK-GI-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.log2.v3f64(<3 x double> %a)
@@ -4264,7 +4304,9 @@ define <3 x float> @log2_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4635,7 +4677,13 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -5351,29 +5399,33 @@ define <3 x double> @log10_v3f64(<3 x double> %a) {
 ;
 ; CHECK-GI-LABEL: log10_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    bl log10
-; CHECK-GI-NEXT:    fmov d10, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl log10
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl log10
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    fmov d0, d10
-; CHECK-GI-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.log10.v3f64(<3 x double> %a)
@@ -5567,7 +5619,9 @@ define <3 x float> @log10_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -5938,7 +5992,13 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index bbfec8c7c33617..83b6f3c26f34c6 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -16,8 +16,12 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
 ;
 ; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
-; CHECK-GI-NEXT:    uzp2 v1.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[3]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-GI-NEXT:    fmov d0, d2
 ; CHECK-GI-NEXT:    ret
   %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index fb12f8acf17453..c2e91a9956af91 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -154,6 +154,7 @@ define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    fmin d2, d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fmin v0.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -191,6 +192,7 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    fmax d2, d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fmax v0.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -255,20 +257,48 @@ entry:
 }
 
 define <3 x float> @min_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: min_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: min_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: min_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    fmin v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
   ret <3 x float> %c
 }
 
 define <3 x float> @max_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: max_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: max_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: max_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    fmax v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
   ret <3 x float> %c
@@ -662,32 +692,68 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fmin v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT:    fmin v1.4s, v1.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmin v0.4s, v0.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT:    fmin v0.8h, v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-FP16-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-FP16-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-FP16-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-FP16-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-FP16-GI-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-FP16-GI-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-FP16-GI-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT:    fmin v1.8h, v2.8h, v3.8h
+; CHECK-FP16-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.minimum.v7f16(<7 x half> %a, <7 x half> %b)
@@ -760,32 +826,68 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fmax v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT:    fmax v1.4s, v1.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmax v0.4s, v0.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT:    fmax v0.8h, v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-FP16-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-FP16-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-FP16-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-FP16-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-FP16-GI-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-FP16-GI-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-FP16-GI-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT:    fmax v1.8h, v2.8h, v3.8h
+; CHECK-FP16-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.maximum.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index 64f0da8b4cd0f9..b7af6be8721d68 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -154,6 +154,7 @@ define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    fminnm d2, d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fminnm v0.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -191,6 +192,7 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    fmaxnm d2, d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fmaxnm v0.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -255,20 +257,48 @@ entry:
 }
 
 define <3 x float> @min_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: min_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fminnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: min_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: min_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    fminnm v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b)
   ret <3 x float> %c
 }
 
 define <3 x float> @max_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: max_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: max_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: max_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    fmaxnm v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b)
   ret <3 x float> %c
@@ -662,32 +692,68 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fminnm v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT:    fminnm v1.4s, v1.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fminnm v0.4s, v0.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT:    fminnm v0.8h, v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-FP16-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-FP16-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-FP16-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-FP16-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-FP16-GI-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-FP16-GI-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-FP16-GI-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT:    fminnm v1.8h, v2.8h, v3.8h
+; CHECK-FP16-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.minnum.v7f16(<7 x half> %a, <7 x half> %b)
@@ -760,32 +826,68 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT:    fmaxnm v1.4s, v1.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmaxnm v0.4s, v0.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT:    fmaxnm v0.8h, v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-FP16-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-FP16-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-FP16-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-FP16-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-FP16-GI-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-FP16-GI-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-FP16-GI-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT:    fmaxnm v1.8h, v2.8h, v3.8h
+; CHECK-FP16-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.maxnum.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 7bcaae5a77eac5..0a9d4c7b657e06 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -105,6 +105,7 @@ define <3 x double> @fma_v3f64(<3 x double> %a, <3 x double> %b, <3 x double> %c
 ; CHECK-GI-NEXT:    fmla v6.2d, v3.2d, v0.2d
 ; CHECK-GI-NEXT:    ldr d0, [sp]
 ; CHECK-GI-NEXT:    fmadd d2, d2, d5, d0
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v6.d[1]
 ; CHECK-GI-NEXT:    fmov d0, d6
 ; CHECK-GI-NEXT:    ret
@@ -138,11 +139,28 @@ entry:
 }
 
 define <3 x float> @fma_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
-; CHECK-LABEL: fma_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmla v2.4s, v1.4s, v0.4s
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fma_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmla v2.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fma_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v2.s[0]
+; CHECK-GI-NEXT:    mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v4.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v4.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v2.s[2]
+; CHECK-GI-NEXT:    fmla v5.4s, v4.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v5.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v5.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v5.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = call <3 x float> @llvm.fma.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c)
   ret <3 x float> %d
@@ -254,38 +272,84 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fma_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fmla v5.4s, v4.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v2.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v5.4h, v5.4s
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v5.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v5.h[1]
-; CHECK-GI-NOFP16-NEXT:    fmla v3.4s, v2.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v5.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v5.h[1]
+; CHECK-GI-NOFP16-NEXT:    fmla v3.4s, v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v5.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v5.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fma_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fmla v2.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-FP16-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v4.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[0], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v4.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], v2.h[1]
+; CHECK-GI-FP16-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v4.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], v2.h[2]
+; CHECK-GI-FP16-NEXT:    mov v3.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v4.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], v2.h[3]
+; CHECK-GI-FP16-NEXT:    mov v3.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v4.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], v2.h[4]
+; CHECK-GI-FP16-NEXT:    mov v3.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v4.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], v2.h[5]
+; CHECK-GI-FP16-NEXT:    mov v3.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v4.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], v2.h[6]
+; CHECK-GI-FP16-NEXT:    fmla v5.8h, v4.8h, v3.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v5.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v5.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v5.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v5.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v5.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v5.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v5.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %d = call <7 x half> @llvm.fma.v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c)
@@ -756,6 +820,7 @@ define <3 x double> @fmuladd_v3f64(<3 x double> %a, <3 x double> %b, <3 x double
 ; CHECK-GI-NEXT:    fmla v6.2d, v3.2d, v0.2d
 ; CHECK-GI-NEXT:    ldr d0, [sp]
 ; CHECK-GI-NEXT:    fmadd d2, d2, d5, d0
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v6.d[1]
 ; CHECK-GI-NEXT:    fmov d0, d6
 ; CHECK-GI-NEXT:    ret
@@ -789,11 +854,28 @@ entry:
 }
 
 define <3 x float> @fmuladd_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
-; CHECK-LABEL: fmuladd_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmla v2.4s, v1.4s, v0.4s
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fmuladd_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmla v2.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fmuladd_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v2.s[0]
+; CHECK-GI-NEXT:    mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v4.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v4.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v2.s[2]
+; CHECK-GI-NEXT:    fmla v5.4s, v4.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v5.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v5.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v5.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c)
   ret <3 x float> %d
@@ -852,44 +934,90 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmuladd_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmuladd_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fmla v2.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-FP16-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v4.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[0], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v4.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], v2.h[1]
+; CHECK-GI-FP16-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v4.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], v2.h[2]
+; CHECK-GI-FP16-NEXT:    mov v3.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v4.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], v2.h[3]
+; CHECK-GI-FP16-NEXT:    mov v3.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v4.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], v2.h[4]
+; CHECK-GI-FP16-NEXT:    mov v3.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v4.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], v2.h[5]
+; CHECK-GI-FP16-NEXT:    mov v3.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v4.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], v2.h[6]
+; CHECK-GI-FP16-NEXT:    fmla v5.8h, v4.8h, v3.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v5.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v5.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v5.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v5.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v5.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v5.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v5.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %d = call <7 x half> @llvm.fmuladd.v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c)
@@ -1204,6 +1332,7 @@ define <3 x double> @fmul_v3f64(<3 x double> %a, <3 x double> %b, <3 x double> %
 ; CHECK-GI-NEXT:    fmla v6.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    ldr d0, [sp]
 ; CHECK-GI-NEXT:    fmadd d2, d2, d5, d0
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v6.d[1]
 ; CHECK-GI-NEXT:    fmov d0, d6
 ; CHECK-GI-NEXT:    ret
@@ -1262,8 +1391,19 @@ define <3 x float> @fmul_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
 ;
 ; CHECK-GI-LABEL: fmul_v3f32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmla v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v2.s[0]
+; CHECK-GI-NEXT:    mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v4.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v4.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v2.s[2]
+; CHECK-GI-NEXT:    fmla v5.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v5.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v5.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v5.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = fmul fast <3 x float> %a, %b
@@ -1340,44 +1480,90 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fmla v2.8h, v0.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-FP16-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v4.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[0], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v4.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], v2.h[1]
+; CHECK-GI-FP16-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v4.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], v2.h[2]
+; CHECK-GI-FP16-NEXT:    mov v3.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v4.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], v2.h[3]
+; CHECK-GI-FP16-NEXT:    mov v3.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v4.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], v2.h[4]
+; CHECK-GI-FP16-NEXT:    mov v3.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v4.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], v2.h[5]
+; CHECK-GI-FP16-NEXT:    mov v3.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v4.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], v2.h[6]
+; CHECK-GI-FP16-NEXT:    fmla v5.8h, v3.8h, v4.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v5.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v5.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v5.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v5.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v5.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v5.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v5.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %d = fmul fast <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index bd3d1353e643e5..de6618ac18f157 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -93,6 +93,7 @@ define <3 x double> @fmul_v3f64(<3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    fmul d2, d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fmul v0.2d, v0.2d, v3.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -130,10 +131,24 @@ entry:
 }
 
 define <3 x float> @fmul_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: fmul_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fmul_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fmul_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    fmul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fmul <3 x float> %a, %b
   ret <3 x float> %c
@@ -186,32 +201,68 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v0.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fmul v1.8h, v2.8h, v3.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fmul <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index de2671afe60ab7..dd6266e8b3b1f4 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -88,6 +88,7 @@ define <3 x double> @fabs_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fneg d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fneg v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -119,10 +120,21 @@ entry:
 }
 
 define <3 x float> @fabs_v3f32(<3 x float> %a) {
-; CHECK-LABEL: fabs_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fneg v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fabs_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fneg v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fabs_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fneg v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fneg <3 x float> %a
   ret <3 x float> %c
@@ -163,13 +175,41 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    movi v1.8h, #128, lsl #8
-; CHECK-GI-NOFP16-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v2.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fneg v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    fneg v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fneg <7 x half> %a
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index df90f9d5f09109..7a30b68be6eae2 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -82,9 +82,12 @@ define <3 x double> @fpext_v3f32_v3f64(<3 x float> %a) {
 ;
 ; CHECK-GI-LABEL: fpext_v3f32_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s1, v0.s[2]
-; CHECK-GI-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-GI-NEXT:    fcvt d2, s1
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    fcvt d2, s2
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    fcvtl v0.2d, v1.2s
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -355,10 +358,14 @@ define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    fcvt d0, h0
+; CHECK-GI-NEXT:    fcvt d3, h0
+; CHECK-GI-NEXT:    mov h0, v0.h[2]
 ; CHECK-GI-NEXT:    fcvt d1, h1
-; CHECK-GI-NEXT:    fcvt d2, h2
+; CHECK-GI-NEXT:    fcvt d2, h0
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fpext <3 x half> %a to <3 x double>
@@ -403,10 +410,22 @@ entry:
 }
 
 define <3 x float> @fpext_v3f16_v3f32(<3 x half> %a) {
-; CHECK-LABEL: fpext_v3f16_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fpext_v3f16_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fpext_v3f16_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fpext <3 x half> %a to <3 x float>
   ret <3 x float> %c
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index dc93d5be9b3f38..fb7efe82582322 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -156,38 +156,42 @@ define <3 x double> @pow_v3f64(<3 x double> %a, <3 x double> %b) {
 ;
 ; CHECK-GI-LABEL: pow_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d12, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d11, d10, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #24] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #40] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
-; CHECK-GI-NEXT:    .cfi_offset b11, -40
-; CHECK-GI-NEXT:    .cfi_offset b12, -48
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    .cfi_offset b10, -40
+; CHECK-GI-NEXT:    .cfi_offset b11, -48
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d1, d3
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    fmov d10, d4
 ; CHECK-GI-NEXT:    fmov d11, d5
 ; CHECK-GI-NEXT:    bl pow
-; CHECK-GI-NEXT:    fmov d12, d0
-; CHECK-GI-NEXT:    fmov d0, d8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d1, d10
+; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl pow
-; CHECK-GI-NEXT:    fmov d8, d0
-; CHECK-GI-NEXT:    fmov d0, d9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d1, d11
+; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl pow
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldp d11, d10, [sp, #8] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    fmov d0, d12
-; CHECK-GI-NEXT:    ldr d12, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.pow.v3f64(<3 x double> %a, <3 x double> %b)
@@ -419,7 +423,9 @@ define <3 x float> @pow_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -879,7 +885,13 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #176
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index 8948556d1b380a..3f122ee06d99a9 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -149,33 +149,37 @@ define <3 x double> @powi_v3f64(<3 x double> %a, i32 %b) {
 ;
 ; CHECK-GI-LABEL: powi_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d10, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-GI-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NEXT:    .cfi_offset w30, -16
 ; CHECK-GI-NEXT:    .cfi_offset b8, -24
 ; CHECK-GI-NEXT:    .cfi_offset b9, -32
-; CHECK-GI-NEXT:    .cfi_offset b10, -48
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __powidf2
-; CHECK-GI-NEXT:    fmov d10, d0
-; CHECK-GI-NEXT:    fmov d0, d8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    mov w0, w19
+; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl __powidf2
-; CHECK-GI-NEXT:    fmov d8, d0
-; CHECK-GI-NEXT:    fmov d0, d9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    mov w0, w19
+; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl __powidf2
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    fmov d0, d10
-; CHECK-GI-NEXT:    ldr d10, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.powi.v3f64.i32(<3 x double> %a, i32 %b)
@@ -393,7 +397,9 @@ define <3 x float> @powi_v3f32(<3 x float> %a, i32 %b) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -809,7 +815,13 @@ define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 9c4f0207b84ce8..1ab72b7dc0056f 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1015,32 +1015,60 @@ entry:
 }
 
 define <3 x i32> @fptos_v3f64_v3i32(<3 x double> %a) {
-; CHECK-LABEL: fptos_v3f64_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptos_v3f64_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptos_v3f64_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-GI-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptosi <3 x double> %a to <3 x i32>
   ret <3 x i32> %c
 }
 
 define <3 x i32> @fptou_v3f64_v3i32(<3 x double> %a) {
-; CHECK-LABEL: fptou_v3f64_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    fcvtzu v1.2d, v2.2d
-; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptou_v3f64_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    fcvtzu v1.2d, v2.2d
+; CHECK-SD-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptou_v3f64_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    fcvtzu v1.2d, v2.2d
+; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-GI-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptoui <3 x double> %a to <3 x i32>
   ret <3 x i32> %c
@@ -1375,17 +1403,33 @@ entry:
 }
 
 define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) {
-; CHECK-LABEL: fptos_v3f64_v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptos_v3f64_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptos_v3f64_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptosi <3 x double> %a to <3 x i16>
   ret <3 x i16> %c
@@ -1413,7 +1457,11 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) {
 ; CHECK-GI-NEXT:    fcvtzu v1.2d, v2.2d
 ; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptoui <3 x double> %a to <3 x i16>
@@ -1876,15 +1924,18 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT:    fmov x2, d1
-; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT:    fmov x1, d2
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    mov v0.s[0], v0.s[0]
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptosi <3 x double> %a to <3 x i8>
@@ -1913,15 +1964,18 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fcvtzu v1.2d, v2.2d
 ; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
-; CHECK-GI-NEXT:    fmov x2, d1
-; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT:    fmov x1, d2
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    mov v0.s[0], v0.s[0]
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    fcvtzu v1.2d, v2.2d
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptoui <3 x double> %a to <3 x i8>
@@ -2585,14 +2639,16 @@ define <3 x i64> @fptos_v3f32_v3i64(<3 x float> %a) {
 ;
 ; CHECK-GI-LABEL: fptos_v3f32_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov v1.s[0], v0.s[2]
-; CHECK-GI-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v0.s[0], v0.s[2]
 ; CHECK-GI-NEXT:    fcvtl v1.2d, v1.2s
-; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT:    fcvtzs v2.2d, v1.2d
+; CHECK-GI-NEXT:    fcvtl v2.2d, v0.2s
+; CHECK-GI-NEXT:    fcvtzs v0.2d, v1.2d
+; CHECK-GI-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptosi <3 x float> %a to <3 x i64>
@@ -2614,14 +2670,16 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) {
 ;
 ; CHECK-GI-LABEL: fptou_v3f32_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov v1.s[0], v0.s[2]
-; CHECK-GI-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v0.s[0], v0.s[2]
 ; CHECK-GI-NEXT:    fcvtl v1.2d, v1.2s
-; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
-; CHECK-GI-NEXT:    fcvtzu v2.2d, v1.2d
+; CHECK-GI-NEXT:    fcvtl v2.2d, v0.2s
+; CHECK-GI-NEXT:    fcvtzu v0.2d, v1.2d
+; CHECK-GI-NEXT:    fcvtzu v2.2d, v2.2d
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptoui <3 x float> %a to <3 x i64>
@@ -3025,20 +3083,42 @@ entry:
 }
 
 define <3 x i32> @fptos_v3f32_v3i32(<3 x float> %a) {
-; CHECK-LABEL: fptos_v3f32_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptos_v3f32_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptos_v3f32_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptosi <3 x float> %a to <3 x i32>
   ret <3 x i32> %c
 }
 
 define <3 x i32> @fptou_v3f32_v3i32(<3 x float> %a) {
-; CHECK-LABEL: fptou_v3f32_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptou_v3f32_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptou_v3f32_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptoui <3 x float> %a to <3 x i32>
   ret <3 x i32> %c
@@ -3172,22 +3252,48 @@ entry:
 }
 
 define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) {
-; CHECK-LABEL: fptos_v3f32_v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptos_v3f32_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptos_v3f32_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptosi <3 x float> %a to <3 x i16>
   ret <3 x i16> %c
 }
 
 define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) {
-; CHECK-LABEL: fptou_v3f32_v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptou_v3f32_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptou_v3f32_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtzu v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptoui <3 x float> %a to <3 x i16>
   ret <3 x i16> %c
@@ -3414,7 +3520,10 @@ define <3 x i8> @fptos_v3f32_v3i8(<3 x float> %a) {
 ;
 ; CHECK-GI-LABEL: fptos_v3f32_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    fmov w0, s0
@@ -3438,7 +3547,10 @@ define <3 x i8> @fptou_v3f32_v3i8(<3 x float> %a) {
 ;
 ; CHECK-GI-LABEL: fptou_v3f32_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtzu v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    fmov w0, s0
@@ -4056,7 +4168,11 @@ define <3 x i64> @fptos_v3f16_v3i64(<3 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i64:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.2d, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    fcvtl2 v2.2d, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.2d, v1.2d
@@ -4120,7 +4236,11 @@ define <3 x i64> @fptou_v3f16_v3i64(<3 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i64:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.2d, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    fcvtl2 v2.2d, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.2d, v1.2d
@@ -5729,22 +5849,48 @@ entry:
 }
 
 define <3 x i32> @fptos_v3f16_v3i32(<3 x half> %a) {
-; CHECK-LABEL: fptos_v3f16_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptos_v3f16_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptos_v3f16_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtzs v1.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptosi <3 x half> %a to <3 x i32>
   ret <3 x i32> %c
 }
 
 define <3 x i32> @fptou_v3f16_v3i32(<3 x half> %a) {
-; CHECK-LABEL: fptou_v3f16_v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptou_v3f16_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptou_v3f16_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtzu v1.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptoui <3 x half> %a to <3 x i32>
   ret <3 x i32> %c
@@ -6027,14 +6173,37 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcvtzs v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    fcvtzs v1.4h, v1.4h
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fptosi <3 x half> %a to <3 x i16>
@@ -6056,14 +6225,37 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcvtzu v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    fcvtzu v1.4h, v1.4h
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fptoui <3 x half> %a to <3 x i16>
@@ -6493,7 +6685,11 @@ define <3 x i8> @fptos_v3f16_v3i8(<3 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i8:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[2]
@@ -6504,10 +6700,22 @@ define <3 x i8> @fptos_v3f16_v3i8(<3 x half> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i8:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcvtzs v0.4h, v0.4h
-; CHECK-GI-FP16-NEXT:    umov w0, v0.h[0]
-; CHECK-GI-FP16-NEXT:    umov w1, v0.h[1]
-; CHECK-GI-FP16-NEXT:    umov w2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    fcvtzs v0.4h, v1.4h
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[0]
+; CHECK-GI-FP16-NEXT:    umov w9, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov s0, v1.s[1]
+; CHECK-GI-FP16-NEXT:    mov s2, v1.s[2]
+; CHECK-GI-FP16-NEXT:    fmov w0, s1
+; CHECK-GI-FP16-NEXT:    fmov w1, s0
+; CHECK-GI-FP16-NEXT:    fmov w2, s2
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fptosi <3 x half> %a to <3 x i8>
@@ -6535,7 +6743,11 @@ define <3 x i8> @fptou_v3f16_v3i8(<3 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i8:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[2]
@@ -6546,10 +6758,22 @@ define <3 x i8> @fptou_v3f16_v3i8(<3 x half> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i8:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcvtzu v0.4h, v0.4h
-; CHECK-GI-FP16-NEXT:    umov w0, v0.h[0]
-; CHECK-GI-FP16-NEXT:    umov w1, v0.h[1]
-; CHECK-GI-FP16-NEXT:    umov w2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    fcvtzu v0.4h, v1.4h
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[0]
+; CHECK-GI-FP16-NEXT:    umov w9, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov s0, v1.s[1]
+; CHECK-GI-FP16-NEXT:    mov s2, v1.s[2]
+; CHECK-GI-FP16-NEXT:    fmov w0, s1
+; CHECK-GI-FP16-NEXT:    fmov w1, s0
+; CHECK-GI-FP16-NEXT:    fmov w2, s2
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fptoui <3 x half> %a to <3 x i8>
@@ -7323,11 +7547,14 @@ define <3 x i64> @fptos_v3f128_v3i64(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov x20, x0
 ; CHECK-GI-NEXT:    bl __fixtfdi
-; CHECK-GI-NEXT:    fmov d0, x19
-; CHECK-GI-NEXT:    fmov d1, x20
+; CHECK-GI-NEXT:    mov v0.d[0], x19
+; CHECK-GI-NEXT:    mov v2.d[0], x0
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    fmov d2, x0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -7380,11 +7607,14 @@ define <3 x i64> @fptou_v3f128_v3i64(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov x20, x0
 ; CHECK-GI-NEXT:    bl __fixunstfdi
-; CHECK-GI-NEXT:    fmov d0, x19
-; CHECK-GI-NEXT:    fmov d1, x20
+; CHECK-GI-NEXT:    mov v0.d[0], x19
+; CHECK-GI-NEXT:    mov v2.d[0], x0
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov v0.d[1], x20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    fmov d2, x0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -7519,11 +7749,14 @@ define <3 x i32> @fptos_v3f128_v3i32(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
-; CHECK-GI-NEXT:    mov v0.s[0], w19
+; CHECK-GI-NEXT:    mov v1.s[0], w19
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.s[1], w20
+; CHECK-GI-NEXT:    mov v1.s[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.s[2], w0
+; CHECK-GI-NEXT:    mov v1.s[2], w0
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -7572,11 +7805,14 @@ define <3 x i32> @fptou_v3f128_v3i32(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
-; CHECK-GI-NEXT:    mov v0.s[0], w19
+; CHECK-GI-NEXT:    mov v1.s[0], w19
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.s[1], w20
+; CHECK-GI-NEXT:    mov v1.s[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.s[2], w0
+; CHECK-GI-NEXT:    mov v1.s[2], w0
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -7714,11 +7950,15 @@ define <3 x i16> @fptos_v3f128_v3i16(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
-; CHECK-GI-NEXT:    fmov s0, w19
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[1], w20
+; CHECK-GI-NEXT:    mov v0.s[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[2], w0
+; CHECK-GI-NEXT:    mov v0.s[2], w0
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -7771,11 +8011,15 @@ define <3 x i16> @fptou_v3f128_v3i16(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
-; CHECK-GI-NEXT:    fmov s0, w19
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[1], w20
+; CHECK-GI-NEXT:    mov v0.s[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[2], w0
+; CHECK-GI-NEXT:    mov v0.s[2], w0
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -7917,11 +8161,16 @@ define <3 x i8> @fptos_v3f128_v3i8(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
-; CHECK-GI-NEXT:    mov w2, w0
-; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    mov w1, w20
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.s[1], w20
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.s[2], w0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -7976,11 +8225,16 @@ define <3 x i8> @fptou_v3f128_v3i8(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
-; CHECK-GI-NEXT:    mov w2, w0
-; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    mov w1, w20
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.s[1], w20
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.s[2], w0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 9ef6d61c350ecf..a7c51ea2b9ace1 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -48,10 +48,21 @@ define <2 x i32> @test_signed_v2f32_v2i32(<2 x float> %f) {
 }
 
 define <3 x i32> @test_signed_v3f32_v3i32(<3 x float> %f) {
-; CHECK-LABEL: test_signed_v3f32_v3i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_signed_v3f32_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_signed_v3f32_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
     %x = call <3 x i32> @llvm.fptosi.sat.v3f32.v3i32(<3 x float> %f)
     ret <3 x i32> %x
 }
@@ -320,7 +331,10 @@ define <3 x i32> @test_signed_v3f64_v3i32(<3 x double> %f) {
 ; CHECK-GI-NEXT:    cmgt v3.2d, v0.2d, v2.2d
 ; CHECK-GI-NEXT:    bif v1.16b, v2.16b, v4.16b
 ; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
     %x = call <3 x i32> @llvm.fptosi.sat.v3f64.v3i32(<3 x double> %f)
     ret <3 x i32> %x
@@ -383,36 +397,35 @@ define <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) {
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT:    fcvtzs v3.2d, v4.2d
+; CHECK-GI-NEXT:    fcvtzs v4.2d, v4.2d
 ; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI12_1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
-; CHECK-GI-NEXT:    cmgt v4.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    cmgt v3.2d, v2.2d, v0.2d
 ; CHECK-GI-NEXT:    cmgt v5.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v3.16b
 ; CHECK-GI-NEXT:    bif v1.16b, v2.16b, v5.16b
+; CHECK-GI-NEXT:    cmgt v5.2d, v2.2d, v4.2d
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT:    bit v2.16b, v4.16b, v5.16b
+; CHECK-GI-NEXT:    cmgt v6.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT:    cmgt v7.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v7.16b
 ; CHECK-GI-NEXT:    cmgt v4.2d, v2.2d, v3.2d
-; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
-; CHECK-GI-NEXT:    bit v2.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT:    cmgt v3.2d, v0.2d, v5.2d
-; CHECK-GI-NEXT:    cmgt v4.2d, v1.2d, v5.2d
-; CHECK-GI-NEXT:    bif v0.16b, v5.16b, v3.16b
-; CHECK-GI-NEXT:    bif v1.16b, v5.16b, v4.16b
-; CHECK-GI-NEXT:    cmgt v3.2d, v2.2d, v5.2d
-; CHECK-GI-NEXT:    bif v2.16b, v5.16b, v3.16b
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    fmov x2, d1
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT:    fmov x4, d2
-; CHECK-GI-NEXT:    fmov x1, d3
-; CHECK-GI-NEXT:    fmov x3, d4
-; CHECK-GI-NEXT:    // kill: def $w4 killed $w4 killed $x4
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
-; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 killed $x3
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v1.16b, v4.16b
+; CHECK-GI-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v0.s[2]
+; CHECK-GI-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    mov v1.s[0], v1.s[0]
+; CHECK-GI-NEXT:    fmov w1, s2
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    fmov w3, s4
+; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    ret
     %x = call <5 x i32> @llvm.fptosi.sat.v5f64.v5i32(<5 x double> %f)
     ret <5 x i32> %x
@@ -431,49 +444,49 @@ define <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v6f64_v6i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-GI-NEXT:    adrp x8, .LCPI13_1
+; CHECK-GI-NEXT:    mov v4.d[1], v5.d[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT:    mov v4.d[1], v5.d[0]
 ; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI13_1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
+; CHECK-GI-NEXT:    ldr q6, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT:    fcvtzs v1.2d, v4.2d
 ; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-GI-NEXT:    fcvtzs v2.2d, v4.2d
+; CHECK-GI-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-GI-NEXT:    cmgt v4.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    cmgt v5.2d, v3.2d, v2.2d
+; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v4.16b
 ; CHECK-GI-NEXT:    cmgt v4.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    cmgt v5.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    cmgt v6.2d, v3.2d, v2.2d
-; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v5.16b
-; CHECK-GI-NEXT:    bif v2.16b, v3.16b, v6.16b
-; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI13_0]
-; CHECK-GI-NEXT:    cmgt v4.2d, v0.2d, v3.2d
-; CHECK-GI-NEXT:    cmgt v5.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT:    cmgt v6.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT:    bif v2.16b, v3.16b, v5.16b
 ; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v5.16b
-; CHECK-GI-NEXT:    bif v2.16b, v3.16b, v6.16b
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
-; CHECK-GI-NEXT:    mov d5, v2.d[1]
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    fmov x2, d1
-; CHECK-GI-NEXT:    fmov x4, d2
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT:    // kill: def $w4 killed $w4 killed $x4
-; CHECK-GI-NEXT:    fmov x1, d3
-; CHECK-GI-NEXT:    fmov x3, d4
-; CHECK-GI-NEXT:    fmov x5, d5
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
-; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 killed $x3
-; CHECK-GI-NEXT:    // kill: def $w5 killed $w5 killed $x5
+; CHECK-GI-NEXT:    cmgt v3.2d, v1.2d, v6.2d
+; CHECK-GI-NEXT:    cmgt v4.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT:    bif v1.16b, v6.16b, v3.16b
+; CHECK-GI-NEXT:    cmgt v3.2d, v0.2d, v6.2d
+; CHECK-GI-NEXT:    bif v2.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT:    bif v0.16b, v6.16b, v3.16b
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    mov v1.s[0], v1.s[0]
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    fmov x8, d3
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v0.s[2]
+; CHECK-GI-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    mov s5, v1.s[1]
+; CHECK-GI-NEXT:    fmov w1, s2
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    fmov w3, s4
+; CHECK-GI-NEXT:    fmov w4, s1
+; CHECK-GI-NEXT:    fmov w5, s5
 ; CHECK-GI-NEXT:    ret
     %x = call <6 x i32> @llvm.fptosi.sat.v6f64.v6i32(<6 x double> %f)
     ret <6 x i32> %x
@@ -902,14 +915,17 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    mov v1.16b, v0.16b
 ; CHECK-GI-NEXT:    bl __unordtf2
-; CHECK-GI-NEXT:    mov v0.s[0], w21
+; CHECK-GI-NEXT:    mov v1.s[0], w21
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    csel w8, wzr, w19, ne
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.s[1], w23
+; CHECK-GI-NEXT:    mov v1.s[1], w23
 ; CHECK-GI-NEXT:    ldp x30, x23, [sp, #80] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #128
 ; CHECK-GI-NEXT:    ret
     %x = call <3 x i32> @llvm.fptosi.sat.v3f128.v3i32(<3 x fp128> %f)
@@ -1221,11 +1237,24 @@ define <2 x i32> @test_signed_v2f16_v2i32(<2 x half> %f) {
 }
 
 define <3 x i32> @test_signed_v3f16_v3i32(<3 x half> %f) {
-; CHECK-LABEL: test_signed_v3f16_v3i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_signed_v3f16_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_signed_v3f16_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtzs v1.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
     %x = call <3 x i32> @llvm.fptosi.sat.v3f16.v3i32(<3 x half> %f)
     ret <3 x i32> %x
 }
@@ -1256,18 +1285,22 @@ define <5 x i32> @test_signed_v5f16_v5i32(<5 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v5f16_v5i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
 ; CHECK-GI-NEXT:    mov v0.h[0], v0.h[4]
-; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    mov s2, v1.s[1]
 ; CHECK-GI-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NEXT:    mov s4, v1.s[3]
 ; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w3, s4
 ; CHECK-GI-NEXT:    ret
     %x = call <5 x i32> @llvm.fptosi.sat.v5f16.v5i32(<5 x half> %f)
@@ -1291,22 +1324,26 @@ define <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v6f16_v6i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[5]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtl v1.4s, v2.4h
 ; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NEXT:    mov s4, v1.s[1]
+; CHECK-GI-NEXT:    mov s5, v0.s[3]
 ; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    mov s5, v1.s[1]
+; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w3, s4
-; CHECK-GI-NEXT:    fmov w4, s1
-; CHECK-GI-NEXT:    fmov w5, s5
+; CHECK-GI-NEXT:    fmov w5, s4
+; CHECK-GI-NEXT:    fmov w3, s5
 ; CHECK-GI-NEXT:    ret
     %x = call <6 x i32> @llvm.fptosi.sat.v6f16.v6i32(<6 x half> %f)
     ret <6 x i32> %x
@@ -1330,23 +1367,27 @@ define <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v7f16_v7i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[6]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtl v1.4s, v2.4h
 ; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-NEXT:    mov s4, v0.s[3]
-; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NEXT:    mov s6, v1.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
 ; CHECK-GI-NEXT:    fmov w3, s4
-; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w5, s5
 ; CHECK-GI-NEXT:    fmov w6, s6
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index e1670ad2dc053b..eb68125080f33a 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -48,10 +48,21 @@ define <2 x i32> @test_unsigned_v2f32_v2i32(<2 x float> %f) {
 }
 
 define <3 x i32> @test_unsigned_v3f32_v3i32(<3 x float> %f) {
-; CHECK-LABEL: test_unsigned_v3f32_v3i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_unsigned_v3f32_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_unsigned_v3f32_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
     %x = call <3 x i32> @llvm.fptoui.sat.v3f32.v3i32(<3 x float> %f)
     ret <3 x i32> %x
 }
@@ -308,7 +319,10 @@ define <3 x i32> @test_unsigned_v3f64_v3i32(<3 x double> %f) {
 ; CHECK-GI-NEXT:    bif v1.16b, v2.16b, v4.16b
 ; CHECK-GI-NEXT:    cmhi v3.2d, v2.2d, v0.2d
 ; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
     %x = call <3 x i32> @llvm.fptoui.sat.v3f64.v3i32(<3 x double> %f)
     ret <3 x i32> %x
@@ -364,27 +378,25 @@ define <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
 ; CHECK-GI-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-GI-NEXT:    fcvtzu v3.2d, v4.2d
+; CHECK-GI-NEXT:    fcvtzu v4.2d, v4.2d
 ; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtzu v2.2d, v2.2d
-; CHECK-GI-NEXT:    cmhi v4.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    cmhi v3.2d, v1.2d, v0.2d
 ; CHECK-GI-NEXT:    cmhi v5.2d, v1.2d, v2.2d
-; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v4.16b
+; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v3.16b
 ; CHECK-GI-NEXT:    bif v2.16b, v1.16b, v5.16b
-; CHECK-GI-NEXT:    cmhi v4.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT:    bit v1.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v2.d[1]
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    fmov x2, d2
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT:    fmov x4, d1
-; CHECK-GI-NEXT:    fmov x1, d3
-; CHECK-GI-NEXT:    fmov x3, d4
-; CHECK-GI-NEXT:    // kill: def $w4 killed $w4 killed $x4
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
-; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 killed $x3
+; CHECK-GI-NEXT:    cmhi v3.2d, v1.2d, v4.2d
+; CHECK-GI-NEXT:    bit v1.16b, v4.16b, v3.16b
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    mov v1.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v0.s[2]
+; CHECK-GI-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w4, s1
+; CHECK-GI-NEXT:    fmov w1, s2
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    fmov w3, s4
 ; CHECK-GI-NEXT:    ret
     %x = call <5 x i32> @llvm.fptoui.sat.v5f64.v5i32(<5 x double> %f)
     ret <5 x i32> %x
@@ -403,40 +415,40 @@ define <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v6f64_v6i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT:    mov v4.d[1], v5.d[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT:    mov v4.d[1], v5.d[0]
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-GI-NEXT:    movi v3.2d, #0x000000ffffffff
+; CHECK-GI-NEXT:    fcvtzu v1.2d, v4.2d
 ; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtzu v2.2d, v2.2d
-; CHECK-GI-NEXT:    fcvtzu v3.2d, v4.2d
-; CHECK-GI-NEXT:    cmhi v4.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    cmhi v5.2d, v1.2d, v2.2d
-; CHECK-GI-NEXT:    cmhi v6.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v4.16b
-; CHECK-GI-NEXT:    bif v2.16b, v1.16b, v5.16b
-; CHECK-GI-NEXT:    bit v1.16b, v3.16b, v6.16b
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v2.d[1]
-; CHECK-GI-NEXT:    mov d5, v1.d[1]
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    fmov x2, d2
-; CHECK-GI-NEXT:    fmov x4, d1
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT:    // kill: def $w4 killed $w4 killed $x4
-; CHECK-GI-NEXT:    fmov x1, d3
-; CHECK-GI-NEXT:    fmov x3, d4
-; CHECK-GI-NEXT:    fmov x5, d5
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
-; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 killed $x3
-; CHECK-GI-NEXT:    // kill: def $w5 killed $w5 killed $x5
+; CHECK-GI-NEXT:    cmhi v4.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    cmhi v5.2d, v3.2d, v2.2d
+; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT:    cmhi v4.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    bif v2.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    mov v1.s[0], v1.s[0]
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    fmov x8, d3
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v0.s[2]
+; CHECK-GI-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    mov s5, v1.s[1]
+; CHECK-GI-NEXT:    fmov w1, s2
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    fmov w3, s4
+; CHECK-GI-NEXT:    fmov w4, s1
+; CHECK-GI-NEXT:    fmov w5, s5
 ; CHECK-GI-NEXT:    ret
     %x = call <6 x i32> @llvm.fptoui.sat.v6f64.v6i32(<6 x double> %f)
     ret <6 x i32> %x
@@ -781,12 +793,15 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-GI-NEXT:    csel x8, x23, x21, gt
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    bl __fixunstfsi
-; CHECK-GI-NEXT:    mov v0.s[0], w19
+; CHECK-GI-NEXT:    mov v1.s[0], w19
 ; CHECK-GI-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x30, x23, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.s[1], w20
+; CHECK-GI-NEXT:    mov v1.s[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.s[2], w0
+; CHECK-GI-NEXT:    mov v1.s[2], w0
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #112
 ; CHECK-GI-NEXT:    ret
     %x = call <3 x i32> @llvm.fptoui.sat.v3f128.v3i32(<3 x fp128> %f)
@@ -1052,11 +1067,24 @@ define <2 x i32> @test_unsigned_v2f16_v2i32(<2 x half> %f) {
 }
 
 define <3 x i32> @test_unsigned_v3f16_v3i32(<3 x half> %f) {
-; CHECK-LABEL: test_unsigned_v3f16_v3i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_unsigned_v3f16_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_unsigned_v3f16_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtzu v1.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
     %x = call <3 x i32> @llvm.fptoui.sat.v3f16.v3i32(<3 x half> %f)
     ret <3 x i32> %x
 }
@@ -1087,18 +1115,22 @@ define <5 x i32> @test_unsigned_v5f16_v5i32(<5 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v5f16_v5i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
 ; CHECK-GI-NEXT:    mov v0.h[0], v0.h[4]
-; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
 ; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-GI-NEXT:    mov s2, v1.s[1]
 ; CHECK-GI-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NEXT:    mov s4, v1.s[3]
 ; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w3, s4
 ; CHECK-GI-NEXT:    ret
     %x = call <5 x i32> @llvm.fptoui.sat.v5f16.v5i32(<5 x half> %f)
@@ -1122,22 +1154,26 @@ define <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v6f16_v6i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[5]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtl v1.4s, v2.4h
 ; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NEXT:    mov s4, v1.s[1]
+; CHECK-GI-NEXT:    mov s5, v0.s[3]
 ; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    mov s5, v1.s[1]
+; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w3, s4
-; CHECK-GI-NEXT:    fmov w4, s1
-; CHECK-GI-NEXT:    fmov w5, s5
+; CHECK-GI-NEXT:    fmov w5, s4
+; CHECK-GI-NEXT:    fmov w3, s5
 ; CHECK-GI-NEXT:    ret
     %x = call <6 x i32> @llvm.fptoui.sat.v6f16.v6i32(<6 x half> %f)
     ret <6 x i32> %x
@@ -1161,23 +1197,27 @@ define <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v7f16_v7i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[6]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtl v1.4s, v2.4h
 ; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-NEXT:    mov s4, v0.s[3]
-; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NEXT:    mov s6, v1.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
 ; CHECK-GI-NEXT:    fmov w3, s4
-; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w5, s5
 ; CHECK-GI-NEXT:    fmov w6, s6
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 2187717c4148ae..89ac7dbe42487d 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -130,9 +130,12 @@ define <2 x half> @fptrunc_v2f128_v2f16(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    bl __trunctfhf2
 ; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    bl __trunctfhf2
-; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT:    mov h0, v1.h[1]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -261,10 +264,13 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fcvt s2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
-; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptrunc <3 x double> %a to <3 x float>
@@ -295,6 +301,8 @@ define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
 ; CHECK-GI-NEXT:    fcvt h0, d0
 ; CHECK-GI-NEXT:    fcvt h1, d1
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -318,8 +326,16 @@ define <3 x half> @fptrunc_v3f64_v3f16(<3 x double> %a) {
 ; CHECK-GI-NEXT:    fcvt h0, d0
 ; CHECK-GI-NEXT:    fcvt h1, d1
 ; CHECK-GI-NEXT:    fcvt h2, d2
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -366,6 +382,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
 ; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptrunc <2 x float> %a to <2 x half>
@@ -373,10 +392,29 @@ entry:
 }
 
 define <3 x half> @fptrunc_v3f32_v3f16(<3 x float> %a) {
-; CHECK-LABEL: fptrunc_v3f32_v3f16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fptrunc_v3f32_v3f16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fptrunc_v3f32_v3f16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fptrunc <3 x float> %a to <3 x half>
   ret <3 x half> %c
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index feb13da64cbf8a..ad8576c63b1aea 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -157,38 +157,42 @@ define <3 x double> @frem_v3f64(<3 x double> %a, <3 x double> %b) {
 ;
 ; CHECK-GI-LABEL: frem_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d12, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d11, d10, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #24] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #40] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
-; CHECK-GI-NEXT:    .cfi_offset b11, -40
-; CHECK-GI-NEXT:    .cfi_offset b12, -48
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    .cfi_offset b10, -40
+; CHECK-GI-NEXT:    .cfi_offset b11, -48
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d1, d3
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    fmov d10, d4
 ; CHECK-GI-NEXT:    fmov d11, d5
 ; CHECK-GI-NEXT:    bl fmod
-; CHECK-GI-NEXT:    fmov d12, d0
-; CHECK-GI-NEXT:    fmov d0, d8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d1, d10
+; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl fmod
-; CHECK-GI-NEXT:    fmov d8, d0
-; CHECK-GI-NEXT:    fmov d0, d9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d1, d11
+; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl fmod
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldp d11, d10, [sp, #8] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    fmov d0, d12
-; CHECK-GI-NEXT:    ldr d12, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = frem <3 x double> %a, %b
@@ -420,7 +424,9 @@ define <3 x float> @frem_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -880,7 +886,13 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #176
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 2afc56a7139fbf..eac17ec72bc990 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -138,29 +138,33 @@ define <3 x double> @sin_v3f64(<3 x double> %a) {
 ;
 ; CHECK-GI-LABEL: sin_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    bl sin
-; CHECK-GI-NEXT:    fmov d10, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl sin
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl sin
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    fmov d0, d10
-; CHECK-GI-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.sin.v3f64(<3 x double> %a)
@@ -354,7 +358,9 @@ define <3 x float> @sin_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -725,7 +731,13 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1440,29 +1452,33 @@ define <3 x double> @cos_v3f64(<3 x double> %a) {
 ;
 ; CHECK-GI-LABEL: cos_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT:    .cfi_offset w30, -8
-; CHECK-GI-NEXT:    .cfi_offset b8, -16
-; CHECK-GI-NEXT:    .cfi_offset b9, -24
-; CHECK-GI-NEXT:    .cfi_offset b10, -32
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
 ; CHECK-GI-NEXT:    fmov d8, d1
 ; CHECK-GI-NEXT:    fmov d9, d2
 ; CHECK-GI-NEXT:    bl cos
-; CHECK-GI-NEXT:    fmov d10, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d8
 ; CHECK-GI-NEXT:    bl cos
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov d0, d9
 ; CHECK-GI-NEXT:    bl cos
-; CHECK-GI-NEXT:    fmov d1, d8
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    fmov d0, d10
-; CHECK-GI-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x double> @llvm.cos.v3f64(<3 x double> %a)
@@ -1656,7 +1672,9 @@ define <3 x float> @cos_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2027,7 +2045,13 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 6c5fd8e52b017c..15e93e244f1d5c 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -84,6 +84,7 @@ define <3 x double> @sqrt_v3f64(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fsqrt d2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    fsqrt v0.2d, v0.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -115,10 +116,21 @@ entry:
 }
 
 define <3 x float> @sqrt_v3f32(<3 x float> %a) {
-; CHECK-LABEL: sqrt_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fsqrt v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqrt_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fsqrt v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqrt_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    fsqrt v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <3 x float> @llvm.sqrt.v3f32(<3 x float> %a)
   ret <3 x float> %c
@@ -195,27 +207,52 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: sqrt_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fsqrt v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fsqrt v2.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: sqrt_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fsqrt v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT:    fsqrt v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.sqrt.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 61964060ca2c8b..9a49266ace1d9b 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1155,28 +1155,29 @@ define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64>
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $d6 killed $d6 def $q6
-; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-GI-NEXT:    ldr x8, [sp]
 ; CHECK-GI-NEXT:    ldr x10, [sp, #24]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-GI-NEXT:    cmgt v2.2d, v5.2d, v2.2d
-; CHECK-GI-NEXT:    ldp d1, d4, [sp, #8]
 ; CHECK-GI-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    ldp d1, d4, [sp, #8]
+; CHECK-GI-NEXT:    cmgt v2.2d, v5.2d, v2.2d
 ; CHECK-GI-NEXT:    mov v1.d[1], v4.d[0]
 ; CHECK-GI-NEXT:    cmgt v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    fmov x9, d2
 ; CHECK-GI-NEXT:    sbfx x9, x9, #0, #1
 ; CHECK-GI-NEXT:    bsl v0.16b, v6.16b, v1.16b
 ; CHECK-GI-NEXT:    and x8, x8, x9
 ; CHECK-GI-NEXT:    bic x9, x10, x9
 ; CHECK-GI-NEXT:    orr x8, x8, x9
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = icmp slt <3 x i64> %a, %b
@@ -1227,22 +1228,37 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32>
 ;
 ; CHECK-GI-LABEL: v3i32_i32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v4.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    mov v6.s[0], w8
 ; CHECK-GI-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    cmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    mov v4.s[0], w8
-; CHECK-GI-NEXT:    mov v5.s[0], w9
-; CHECK-GI-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NEXT:    mov v5.s[1], w9
-; CHECK-GI-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w9
-; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
-; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v4.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v6.s[1], w8
+; CHECK-GI-NEXT:    mov v4.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v0.s[0], w9
+; CHECK-GI-NEXT:    mov v6.s[2], w8
+; CHECK-GI-NEXT:    cmgt v1.4s, v5.4s, v4.4s
+; CHECK-GI-NEXT:    mov v4.s[0], v2.s[0]
+; CHECK-GI-NEXT:    mov v5.s[0], v3.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT:    neg v6.4s, v6.4s
+; CHECK-GI-NEXT:    mov v4.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v3.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], w9
+; CHECK-GI-NEXT:    sshl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT:    mov v4.s[2], v2.s[2]
+; CHECK-GI-NEXT:    mov v5.s[2], v3.s[2]
+; CHECK-GI-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    and v1.16b, v4.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT:    orr v1.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = icmp slt <3 x i32> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 54ee693db1239f..c67d3b4ee9f410 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -299,12 +299,18 @@ define <3 x float> @insert_v3f32_c(<3 x float> %a, float %b, i32 %c) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sub sp, sp, #16
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, w0
 ; CHECK-GI-NEXT:    mov x8, sp
-; CHECK-GI-NEXT:    str q0, [sp]
 ; CHECK-GI-NEXT:    and x9, x9, #0x3
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    str q2, [sp]
 ; CHECK-GI-NEXT:    str s1, [x8, x9, lsl #2]
-; CHECK-GI-NEXT:    ldr q0, [sp], #16
+; CHECK-GI-NEXT:    ldr q1, [sp], #16
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x float> %a, float %b, i32 %c
@@ -1019,12 +1025,18 @@ define <3 x i32> @insert_v3i32_c(<3 x i32> %a, i32 %b, i32 %c) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sub sp, sp, #16
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, w1
 ; CHECK-GI-NEXT:    mov x8, sp
-; CHECK-GI-NEXT:    str q0, [sp]
 ; CHECK-GI-NEXT:    and x9, x9, #0x3
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    str q1, [sp]
 ; CHECK-GI-NEXT:    str w0, [x8, x9, lsl #2]
-; CHECK-GI-NEXT:    ldr q0, [sp], #16
+; CHECK-GI-NEXT:    ldr q1, [sp], #16
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x i32> %a, i32 %b, i32 %c
@@ -1578,10 +1590,13 @@ define float @extract_v3f32_c(<3 x float> %a, i32 %c) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sub sp, sp, #16
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, w0
 ; CHECK-GI-NEXT:    mov x8, sp
-; CHECK-GI-NEXT:    str q0, [sp]
 ; CHECK-GI-NEXT:    and x9, x9, #0x3
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    str q1, [sp]
 ; CHECK-GI-NEXT:    ldr s0, [x8, x9, lsl #2]
 ; CHECK-GI-NEXT:    add sp, sp, #16
 ; CHECK-GI-NEXT:    ret
@@ -2272,10 +2287,13 @@ define i32 @extract_v3i32_c(<3 x i32> %a, i32 %c) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sub sp, sp, #16
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, w0
 ; CHECK-GI-NEXT:    mov x8, sp
-; CHECK-GI-NEXT:    str q0, [sp]
 ; CHECK-GI-NEXT:    and x9, x9, #0x3
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    str q1, [sp]
 ; CHECK-GI-NEXT:    ldr w0, [x8, x9, lsl #2]
 ; CHECK-GI-NEXT:    add sp, sp, #16
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 81c1a64f2d434f..caff8c527d34a5 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -1345,18 +1345,16 @@ define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) {
 ;
 ; CHECK-GI-LABEL: stofp_v3i128_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-GI-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-NEXT:    .cfi_offset w30, -48
-; CHECK-GI-NEXT:    .cfi_offset b8, -56
-; CHECK-GI-NEXT:    .cfi_offset b9, -64
 ; CHECK-GI-NEXT:    mov x19, x2
 ; CHECK-GI-NEXT:    mov x20, x3
 ; CHECK-GI-NEXT:    mov x21, x4
@@ -1364,19 +1362,24 @@ define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) {
 ; CHECK-GI-NEXT:    bl __floattidf
 ; CHECK-GI-NEXT:    mov x0, x19
 ; CHECK-GI-NEXT:    mov x1, x20
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    bl __floattidf
 ; CHECK-GI-NEXT:    mov x0, x21
 ; CHECK-GI-NEXT:    mov x1, x22
-; CHECK-GI-NEXT:    fmov d9, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    bl __floattidf
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    fmov d0, d8
-; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    fmov d1, d9
-; CHECK-GI-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i128> %a to <3 x double>
@@ -1422,18 +1425,16 @@ define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) {
 ;
 ; CHECK-GI-LABEL: utofp_v3i128_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-GI-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-NEXT:    .cfi_offset w30, -48
-; CHECK-GI-NEXT:    .cfi_offset b8, -56
-; CHECK-GI-NEXT:    .cfi_offset b9, -64
 ; CHECK-GI-NEXT:    mov x19, x2
 ; CHECK-GI-NEXT:    mov x20, x3
 ; CHECK-GI-NEXT:    mov x21, x4
@@ -1441,19 +1442,24 @@ define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) {
 ; CHECK-GI-NEXT:    bl __floatuntidf
 ; CHECK-GI-NEXT:    mov x0, x19
 ; CHECK-GI-NEXT:    mov x1, x20
-; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    bl __floatuntidf
 ; CHECK-GI-NEXT:    mov x0, x21
 ; CHECK-GI-NEXT:    mov x1, x22
-; CHECK-GI-NEXT:    fmov d9, d0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    bl __floatuntidf
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov d2, d0
-; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    fmov d0, d8
-; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    fmov d1, d9
-; CHECK-GI-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i128> %a to <3 x double>
@@ -2009,13 +2015,16 @@ define <3 x double> @stofp_v3i32_v3f64(<3 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: stofp_v3i32_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    scvtf v3.2d, v1.2d
-; CHECK-GI-NEXT:    scvtf v2.2d, v0.2d
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v2.2s, #0
+; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    scvtf v2.2d, v1.2d
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-GI-NEXT:    mov d1, v3.d[1]
-; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i32> %a to <3 x double>
@@ -2037,13 +2046,16 @@ define <3 x double> @utofp_v3i32_v3f64(<3 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: utofp_v3i32_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ucvtf v3.2d, v1.2d
-; CHECK-GI-NEXT:    ucvtf v2.2d, v0.2d
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    ucvtf v2.2d, v1.2d
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-GI-NEXT:    mov d1, v3.d[1]
-; CHECK-GI-NEXT:    fmov d0, d3
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i32> %a to <3 x double>
@@ -2596,7 +2608,11 @@ define <3 x double> @stofp_v3i16_v3f64(<3 x i16> %a) {
 ;
 ; CHECK-GI-LABEL: stofp_v3i16_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
 ; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
@@ -2626,7 +2642,11 @@ define <3 x double> @utofp_v3i16_v3f64(<3 x i16> %a) {
 ;
 ; CHECK-GI-LABEL: utofp_v3i16_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
@@ -4328,7 +4348,9 @@ define <3 x float> @stofp_v3i128_v3f32(<3 x i128> %a) {
 ; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4412,7 +4434,9 @@ define <3 x float> @utofp_v3i128_v3f32(<3 x i128> %a) {
 ; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4461,13 +4485,16 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-GI-NEXT:    scvtf v1.2d, v2.2d
 ; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-GI-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
-; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[0], v2.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v2.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i64> %a to <3 x float>
@@ -4493,13 +4520,16 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-GI-NEXT:    ucvtf v1.2d, v2.2d
 ; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-GI-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
-; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[0], v2.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v2.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i64> %a to <3 x float>
@@ -4831,20 +4861,42 @@ entry:
 }
 
 define <3 x float> @stofp_v3i32_v3f32(<3 x i32> %a) {
-; CHECK-LABEL: stofp_v3i32_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i32_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i32_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i32> %a to <3 x float>
   ret <3 x float> %c
 }
 
 define <3 x float> @utofp_v3i32_v3f32(<3 x i32> %a) {
-; CHECK-LABEL: utofp_v3i32_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i32_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i32_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i32> %a to <3 x float>
   ret <3 x float> %c
@@ -4977,22 +5029,48 @@ entry:
 }
 
 define <3 x float> @stofp_v3i16_v3f32(<3 x i16> %a) {
-; CHECK-LABEL: stofp_v3i16_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    scvtf v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i16_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i16_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    scvtf v1.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i16> %a to <3 x float>
   ret <3 x float> %c
 }
 
 define <3 x float> @utofp_v3i16_v3f32(<3 x i16> %a) {
-; CHECK-LABEL: utofp_v3i16_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ucvtf v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i16_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i16_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ucvtf v1.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i16> %a to <3 x float>
   ret <3 x float> %c
@@ -5258,7 +5336,10 @@ define <3 x float> @stofp_v3i8_v3f32(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-GI-NEXT:    scvtf v1.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i8> %a to <3 x float>
@@ -5288,7 +5369,10 @@ define <3 x float> @utofp_v3i8_v3f32(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-GI-NEXT:    ucvtf v1.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i8> %a to <3 x float>
@@ -5690,11 +5774,14 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov x1, x20
 ; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h1, s0
-; CHECK-GI-NOFP16-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    add sp, sp, #48
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5721,7 +5808,10 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[0]
-; CHECK-GI-FP16-NEXT:    fmov d0, d1
+; CHECK-GI-FP16-NEXT:    mov h0, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    add sp, sp, #48
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5803,11 +5893,14 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov x1, x20
 ; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h1, s0
-; CHECK-GI-NOFP16-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    add sp, sp, #48
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5834,7 +5927,10 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[0]
-; CHECK-GI-FP16-NEXT:    fmov d0, d1
+; CHECK-GI-FP16-NEXT:    mov h0, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    add sp, sp, #48
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5927,55 +6023,63 @@ define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: stofp_v3i128_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    sub sp, sp, #80
-; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NOFP16-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NOFP16-NEXT:    .cfi_offset b8, -56
+; CHECK-GI-NOFP16-NEXT:    .cfi_offset b9, -64
 ; CHECK-GI-NOFP16-NEXT:    mov x19, x2
 ; CHECK-GI-NOFP16-NEXT:    mov x20, x3
 ; CHECK-GI-NOFP16-NEXT:    mov x21, x4
 ; CHECK-GI-NOFP16-NEXT:    mov x22, x5
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    mov x0, x19
 ; CHECK-GI-NOFP16-NEXT:    mov x1, x20
-; CHECK-GI-NOFP16-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    fcvt h8, s0
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    mov x0, x21
 ; CHECK-GI-NOFP16-NEXT:    mov x1, x22
-; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    fcvt h9, s0
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
-; CHECK-GI-NOFP16-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s8
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NOFP16-NEXT:    add sp, sp, #80
+; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s9
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s0
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, v1.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    fmov d0, d1
+; CHECK-GI-NOFP16-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v3i128_v3f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    sub sp, sp, #80
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-FP16-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-FP16-NEXT:    .cfi_offset b8, -56
+; CHECK-GI-FP16-NEXT:    .cfi_offset b9, -64
 ; CHECK-GI-FP16-NEXT:    mov x19, x2
 ; CHECK-GI-FP16-NEXT:    mov x20, x3
 ; CHECK-GI-FP16-NEXT:    mov x21, x4
@@ -5983,24 +6087,28 @@ define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) {
 ; CHECK-GI-FP16-NEXT:    bl __floattihf
 ; CHECK-GI-FP16-NEXT:    mov x0, x19
 ; CHECK-GI-FP16-NEXT:    mov x1, x20
-; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    fmov s8, s0
 ; CHECK-GI-FP16-NEXT:    bl __floattihf
 ; CHECK-GI-FP16-NEXT:    mov x0, x21
 ; CHECK-GI-FP16-NEXT:    mov x1, x22
-; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    fmov s9, s0
 ; CHECK-GI-FP16-NEXT:    bl __floattihf
-; CHECK-GI-FP16-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-FP16-NEXT:    add sp, sp, #80
+; CHECK-GI-FP16-NEXT:    fmov w8, s8
+; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s9
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s0
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v1.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], w9
+; CHECK-GI-FP16-NEXT:    fmov d0, d1
+; CHECK-GI-FP16-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <3 x i128> %a to <3 x half>
@@ -6092,55 +6200,63 @@ define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: utofp_v3i128_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    sub sp, sp, #80
-; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NOFP16-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NOFP16-NEXT:    .cfi_offset b8, -56
+; CHECK-GI-NOFP16-NEXT:    .cfi_offset b9, -64
 ; CHECK-GI-NOFP16-NEXT:    mov x19, x2
 ; CHECK-GI-NOFP16-NEXT:    mov x20, x3
 ; CHECK-GI-NOFP16-NEXT:    mov x21, x4
 ; CHECK-GI-NOFP16-NEXT:    mov x22, x5
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    mov x0, x19
 ; CHECK-GI-NOFP16-NEXT:    mov x1, x20
-; CHECK-GI-NOFP16-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    fcvt h8, s0
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    mov x0, x21
 ; CHECK-GI-NOFP16-NEXT:    mov x1, x22
-; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    fcvt h9, s0
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s8
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NOFP16-NEXT:    add sp, sp, #80
+; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s9
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s0
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, v1.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    fmov d0, d1
+; CHECK-GI-NOFP16-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v3i128_v3f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    sub sp, sp, #80
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-FP16-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-FP16-NEXT:    .cfi_offset b8, -56
+; CHECK-GI-FP16-NEXT:    .cfi_offset b9, -64
 ; CHECK-GI-FP16-NEXT:    mov x19, x2
 ; CHECK-GI-FP16-NEXT:    mov x20, x3
 ; CHECK-GI-FP16-NEXT:    mov x21, x4
@@ -6148,24 +6264,28 @@ define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) {
 ; CHECK-GI-FP16-NEXT:    bl __floatuntihf
 ; CHECK-GI-FP16-NEXT:    mov x0, x19
 ; CHECK-GI-FP16-NEXT:    mov x1, x20
-; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    fmov s8, s0
 ; CHECK-GI-FP16-NEXT:    bl __floatuntihf
 ; CHECK-GI-FP16-NEXT:    mov x0, x21
 ; CHECK-GI-FP16-NEXT:    mov x1, x22
-; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    fmov s9, s0
 ; CHECK-GI-FP16-NEXT:    bl __floatuntihf
-; CHECK-GI-FP16-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-FP16-NEXT:    add sp, sp, #80
+; CHECK-GI-FP16-NEXT:    fmov w8, s8
+; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s9
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s0
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v1.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], w9
+; CHECK-GI-FP16-NEXT:    fmov d0, d1
+; CHECK-GI-FP16-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <3 x i128> %a to <3 x half>
@@ -6202,6 +6322,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v2i64_v2f16:
@@ -6211,6 +6334,8 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -6248,6 +6373,9 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v2i64_v2f16:
@@ -6257,6 +6385,8 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -6288,7 +6418,18 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) {
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v3i64_v3f16:
@@ -6303,8 +6444,16 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-FP16-NEXT:    fmov w8, s0
+; CHECK-GI-FP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s1
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s2
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -6336,7 +6485,18 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) {
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v3i64_v3f16:
@@ -6351,8 +6511,16 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-FP16-NEXT:    fmov w8, s0
+; CHECK-GI-FP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s1
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s2
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -7184,6 +7352,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i32> %a to <2 x half>
@@ -7204,6 +7375,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i32> %a to <2 x half>
@@ -7211,22 +7385,62 @@ entry:
 }
 
 define <3 x half> @stofp_v3i32_v3f16(<3 x i32> %a) {
-; CHECK-LABEL: stofp_v3i32_v3f16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.4s, v0.4s
-; CHECK-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i32_v3f16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-SD-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i32_v3f16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    scvtf v0.4s, v1.4s
+; CHECK-GI-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i32> %a to <3 x half>
   ret <3 x half> %c
 }
 
 define <3 x half> @utofp_v3i32_v3f16(<3 x i32> %a) {
-; CHECK-LABEL: utofp_v3i32_v3f16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.4s, v0.4s
-; CHECK-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i32_v3f16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i32_v3f16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    ucvtf v0.4s, v1.4s
+; CHECK-GI-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i32> %a to <3 x half>
   ret <3 x half> %c
@@ -7411,12 +7625,18 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <2 x i16> %a to <2 x half>
@@ -7446,12 +7666,18 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <2 x i16> %a to <2 x half>
@@ -7473,14 +7699,44 @@ define <3 x half> @stofp_v3i16_v3f16(<3 x i16> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: stofp_v3i16_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    sshll v0.4s, v1.4h, #0
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v3i16_v3f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    scvtf v1.4h, v1.4h
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <3 x i16> %a to <3 x half>
@@ -7502,14 +7758,44 @@ define <3 x half> @utofp_v3i16_v3f16(<3 x i16> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: utofp_v3i16_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v1.4h, #0
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v3i16_v3f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT:    ucvtf v1.4h, v1.4h
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <3 x i16> %a to <3 x half>
@@ -7933,6 +8219,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16:
@@ -7941,6 +8230,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <2 x i8> %a to <2 x half>
@@ -7984,6 +8276,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16:
@@ -7992,6 +8287,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-FP16-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <2 x i8> %a to <2 x half>
@@ -8034,7 +8332,18 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    sshll v1.4s, v1.4h, #0
 ; CHECK-GI-NOFP16-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v3i8_v3f16:
@@ -8043,7 +8352,18 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    mov v0.b[1], w1
 ; CHECK-GI-FP16-NEXT:    mov v0.b[2], w2
 ; CHECK-GI-FP16-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    scvtf v1.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <3 x i8> %a to <3 x half>
@@ -8084,7 +8404,18 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NOFP16-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v3i8_v3f16:
@@ -8093,7 +8424,18 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    mov v0.b[1], w1
 ; CHECK-GI-FP16-NEXT:    mov v0.b[2], w2
 ; CHECK-GI-FP16-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    ucvtf v1.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <3 x i8> %a to <3 x half>
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index c1ea891bc86e7e..33e8a85784d139 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -109,11 +109,14 @@ define <2 x half> @exp10_v2f16(<2 x half> %x) {
 ; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; GISEL-NEXT:    fmov s0, s1
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt h1, s0
-; GISEL-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; GISEL-NEXT:    fcvt h0, s0
+; GISEL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
 ; GISEL-NEXT:    ldr d8, [sp, #16] // 8-byte Folded Reload
-; GISEL-NEXT:    mov v0.h[1], v1.h[0]
+; GISEL-NEXT:    mov v1.h[1], v0.h[0]
+; GISEL-NEXT:    mov h0, v1.h[1]
+; GISEL-NEXT:    mov v1.h[1], v0.h[0]
+; GISEL-NEXT:    mov v0.16b, v1.16b
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; GISEL-NEXT:    add sp, sp, #32
 ; GISEL-NEXT:    ret
@@ -165,10 +168,9 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ;
 ; GISEL-LABEL: exp10_v3f16:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    sub sp, sp, #64
-; GISEL-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
-; GISEL-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
-; GISEL-NEXT:    .cfi_def_cfa_offset 64
+; GISEL-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; GISEL-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 32
 ; GISEL-NEXT:    .cfi_offset w30, -16
 ; GISEL-NEXT:    .cfi_offset b8, -24
 ; GISEL-NEXT:    .cfi_offset b9, -32
@@ -178,24 +180,27 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; GISEL-NEXT:    fcvt s0, h0
 ; GISEL-NEXT:    bl exp10f
 ; GISEL-NEXT:    fcvt s1, h8
-; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; GISEL-NEXT:    fcvt h8, s0
 ; GISEL-NEXT:    fmov s0, s1
 ; GISEL-NEXT:    bl exp10f
 ; GISEL-NEXT:    fcvt s1, h9
-; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; GISEL-NEXT:    fcvt h9, s0
 ; GISEL-NEXT:    fmov s0, s1
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
+; GISEL-NEXT:    fmov w8, s8
 ; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; GISEL-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; GISEL-NEXT:    mov v1.h[1], v2.h[0]
-; GISEL-NEXT:    mov v1.h[2], v0.h[0]
-; GISEL-NEXT:    mov v0.16b, v1.16b
-; GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; GISEL-NEXT:    add sp, sp, #64
+; GISEL-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; GISEL-NEXT:    mov v1.s[0], w8
+; GISEL-NEXT:    fmov w8, s9
+; GISEL-NEXT:    mov v1.s[1], w8
+; GISEL-NEXT:    fmov w8, s0
+; GISEL-NEXT:    mov v1.s[2], w8
+; GISEL-NEXT:    mov w8, v1.s[1]
+; GISEL-NEXT:    mov w9, v1.s[2]
+; GISEL-NEXT:    mov v1.h[1], w8
+; GISEL-NEXT:    mov v1.h[2], w9
+; GISEL-NEXT:    fmov d0, d1
+; GISEL-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
 ; GISEL-NEXT:    ret
   %r = call <3 x half> @llvm.exp10.v3f16(<3 x half> %x)
   ret <3 x half> %r
@@ -436,7 +441,9 @@ define <3 x float> @exp10_v3f32(<3 x float> %x) {
 ; GISEL-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; GISEL-NEXT:    mov v1.s[1], v2.s[0]
 ; GISEL-NEXT:    mov v1.s[2], v0.s[0]
-; GISEL-NEXT:    mov v0.16b, v1.16b
+; GISEL-NEXT:    mov v0.s[0], v1.s[0]
+; GISEL-NEXT:    mov v0.s[1], v1.s[1]
+; GISEL-NEXT:    mov v0.s[2], v1.s[2]
 ; GISEL-NEXT:    add sp, sp, #64
 ; GISEL-NEXT:    ret
   %r = call <3 x float> @llvm.exp10.v3f32(<3 x float> %x)
@@ -624,29 +631,33 @@ define <3 x double> @exp10_v3f64(<3 x double> %x) {
 ;
 ; GISEL-LABEL: exp10_v3f64:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
-; GISEL-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; GISEL-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; GISEL-NEXT:    .cfi_def_cfa_offset 32
-; GISEL-NEXT:    .cfi_offset w30, -8
-; GISEL-NEXT:    .cfi_offset b8, -16
-; GISEL-NEXT:    .cfi_offset b9, -24
-; GISEL-NEXT:    .cfi_offset b10, -32
+; GISEL-NEXT:    sub sp, sp, #64
+; GISEL-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; GISEL-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 64
+; GISEL-NEXT:    .cfi_offset w30, -16
+; GISEL-NEXT:    .cfi_offset b8, -24
+; GISEL-NEXT:    .cfi_offset b9, -32
 ; GISEL-NEXT:    fmov d8, d1
 ; GISEL-NEXT:    fmov d9, d2
 ; GISEL-NEXT:    bl exp10
-; GISEL-NEXT:    fmov d10, d0
+; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
+; GISEL-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; GISEL-NEXT:    fmov d0, d8
 ; GISEL-NEXT:    bl exp10
-; GISEL-NEXT:    fmov d8, d0
+; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
+; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; GISEL-NEXT:    fmov d0, d9
 ; GISEL-NEXT:    bl exp10
-; GISEL-NEXT:    fmov d1, d8
-; GISEL-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; GISEL-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; GISEL-NEXT:    ldp q1, q3, [sp] // 32-byte Folded Reload
 ; GISEL-NEXT:    fmov d2, d0
-; GISEL-NEXT:    fmov d0, d10
-; GISEL-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
+; GISEL-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; GISEL-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; GISEL-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; GISEL-NEXT:    mov v3.d[1], v1.d[0]
+; GISEL-NEXT:    mov d1, v3.d[1]
+; GISEL-NEXT:    fmov d0, d3
+; GISEL-NEXT:    add sp, sp, #64
 ; GISEL-NEXT:    ret
   %r = call <3 x double> @llvm.exp10.v3f64(<3 x double> %x)
   ret <3 x double> %r
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 70ab10e716875a..517cf7c4352fd3 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -215,10 +215,16 @@ define <3 x i8> @load_v3i8(ptr %ptr){
 ;
 ; CHECK-GI-LABEL: load_v3i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w1, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w2, [x0, #2]
-; CHECK-GI-NEXT:    mov w0, w8
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i8>, ptr %ptr
     ret <3 x i8> %a
@@ -232,20 +238,38 @@ define <7 x i8> @load_v7i8(ptr %ptr){
 ;
 ; CHECK-GI-LABEL: load_v7i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #1]
-; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #3]
-; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #4]
-; CHECK-GI-NEXT:    mov v0.b[4], v1.b[0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #5]
-; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #6]
-; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
+; CHECK-GI-NEXT:    ldrb w8, [x0]
+; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    ldrb w8, [x0, #3]
+; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    ldrb w8, [x0, #4]
+; CHECK-GI-NEXT:    mov v0.h[4], w8
+; CHECK-GI-NEXT:    ldrb w8, [x0, #5]
+; CHECK-GI-NEXT:    mov v0.h[5], w8
+; CHECK-GI-NEXT:    ldrb w8, [x0, #6]
+; CHECK-GI-NEXT:    mov v0.h[6], w8
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[3]
+; CHECK-GI-NEXT:    mov h3, v0.h[4]
+; CHECK-GI-NEXT:    mov h4, v0.h[5]
+; CHECK-GI-NEXT:    mov h5, v0.h[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov h1, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.b[3], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.b[4], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.b[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.b[6], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i8>, ptr %ptr
@@ -261,10 +285,14 @@ define <3 x i16> @load_v3i16(ptr %ptr){
 ; CHECK-GI-LABEL: load_v3i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #4]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i16>, ptr %ptr
@@ -279,19 +307,26 @@ define <7 x i16> @load_v7i16(ptr %ptr){
 ;
 ; CHECK-GI-LABEL: load_v7i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ldr h1, [x0]
+; CHECK-GI-NEXT:    ldr h0, [x0, #2]
 ; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #6
-; CHECK-GI-NEXT:    ld1 { v0.h }[3], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[3], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #8
-; CHECK-GI-NEXT:    ld1 { v0.h }[4], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[4], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #10
-; CHECK-GI-NEXT:    ld1 { v0.h }[5], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[5], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #12
-; CHECK-GI-NEXT:    ld1 { v0.h }[6], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[6], [x8]
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i16>, ptr %ptr
     ret <7 x i16> %a
@@ -305,11 +340,14 @@ define <3 x i32> @load_v3i32(ptr %ptr){
 ;
 ; CHECK-GI-LABEL: load_v3i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr s0, [x0]
+; CHECK-GI-NEXT:    ldr s1, [x0]
 ; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.s }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.s }[1], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #8
-; CHECK-GI-NEXT:    ld1 { v0.s }[2], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.s }[2], [x8]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i32>, ptr %ptr
     ret <3 x i32> %a
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 9ca975d9e742e1..9735354402aabf 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -355,10 +355,24 @@ entry:
 }
 
 define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = mul <3 x i32> %d, %e
   ret <3 x i32> %s
@@ -457,14 +471,15 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov x11, v3.d[1]
 ; CHECK-GI-NEXT:    mul x8, x8, x9
 ; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    fmov x10, d5
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mul x8, x8, x10
 ; CHECK-GI-NEXT:    mov v0.d[1], x9
-; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    mul x8, x8, x9
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = mul <3 x i64> %d, %e
diff --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll
index d807635f5d87d1..ad83cc81720728 100644
--- a/llvm/test/CodeGen/AArch64/rem.ll
+++ b/llvm/test/CodeGen/AArch64/rem.ll
@@ -227,10 +227,18 @@ define <3 x i8> @sv3i8(<3 x i8> %d, <3 x i8> %e) {
 ; CHECK-GI-NEXT:    sxtb w15, w5
 ; CHECK-GI-NEXT:    sdiv w10, w8, w9
 ; CHECK-GI-NEXT:    sdiv w13, w11, w12
-; CHECK-GI-NEXT:    msub w0, w10, w9, w8
-; CHECK-GI-NEXT:    sdiv w16, w14, w15
-; CHECK-GI-NEXT:    msub w1, w13, w12, w11
-; CHECK-GI-NEXT:    msub w2, w16, w15, w14
+; CHECK-GI-NEXT:    msub w8, w10, w9, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    sdiv w9, w14, w15
+; CHECK-GI-NEXT:    msub w8, w13, w12, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    msub w8, w9, w15, w14
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <3 x i8> %d, %e
@@ -1141,15 +1149,23 @@ define <3 x i8> @uv3i8(<3 x i8> %d, <3 x i8> %e) {
 ; CHECK-GI-NEXT:    and w8, w0, #0xff
 ; CHECK-GI-NEXT:    and w9, w3, #0xff
 ; CHECK-GI-NEXT:    and w11, w1, #0xff
+; CHECK-GI-NEXT:    udiv w10, w8, w9
 ; CHECK-GI-NEXT:    and w12, w4, #0xff
 ; CHECK-GI-NEXT:    and w14, w2, #0xff
 ; CHECK-GI-NEXT:    and w15, w5, #0xff
-; CHECK-GI-NEXT:    udiv w10, w8, w9
 ; CHECK-GI-NEXT:    udiv w13, w11, w12
-; CHECK-GI-NEXT:    msub w0, w10, w9, w8
-; CHECK-GI-NEXT:    udiv w16, w14, w15
-; CHECK-GI-NEXT:    msub w1, w13, w12, w11
-; CHECK-GI-NEXT:    msub w2, w16, w15, w14
+; CHECK-GI-NEXT:    msub w8, w10, w9, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    udiv w9, w14, w15
+; CHECK-GI-NEXT:    msub w8, w13, w12, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    msub w8, w9, w15, w14
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <3 x i8> %d, %e
@@ -2075,12 +2091,16 @@ define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) {
 ; CHECK-GI-NEXT:    sdiv w10, w8, w9
 ; CHECK-GI-NEXT:    sdiv w13, w11, w12
 ; CHECK-GI-NEXT:    msub w8, w10, w9, w8
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    sdiv w16, w14, w15
-; CHECK-GI-NEXT:    msub w9, w13, w12, w11
-; CHECK-GI-NEXT:    mov v0.h[1], w9
-; CHECK-GI-NEXT:    msub w8, w16, w15, w14
-; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    sdiv w9, w14, w15
+; CHECK-GI-NEXT:    msub w8, w13, w12, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    msub w8, w9, w15, w14
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2543,12 +2563,16 @@ define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) {
 ; CHECK-GI-NEXT:    udiv w10, w8, w9
 ; CHECK-GI-NEXT:    udiv w13, w11, w12
 ; CHECK-GI-NEXT:    msub w8, w10, w9, w8
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    udiv w16, w14, w15
-; CHECK-GI-NEXT:    msub w9, w13, w12, w11
-; CHECK-GI-NEXT:    mov v0.h[1], w9
-; CHECK-GI-NEXT:    msub w8, w16, w15, w14
-; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    udiv w9, w14, w15
+; CHECK-GI-NEXT:    msub w8, w13, w12, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    msub w8, w9, w15, w14
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3003,12 +3027,15 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) {
 ; CHECK-GI-NEXT:    fmov w15, s1
 ; CHECK-GI-NEXT:    sdiv w13, w11, w12
 ; CHECK-GI-NEXT:    msub w8, w10, w9, w8
-; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    sdiv w9, w14, w15
 ; CHECK-GI-NEXT:    msub w8, w13, w12, w11
-; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    msub w8, w9, w15, w14
-; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <3 x i32> %d, %e
@@ -3234,12 +3261,15 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) {
 ; CHECK-GI-NEXT:    fmov w15, s1
 ; CHECK-GI-NEXT:    udiv w13, w11, w12
 ; CHECK-GI-NEXT:    msub w8, w10, w9, w8
-; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    udiv w9, w14, w15
 ; CHECK-GI-NEXT:    msub w8, w13, w12, w11
-; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    msub w8, w9, w15, w14
-; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <3 x i32> %d, %e
@@ -3469,25 +3499,26 @@ define <3 x i64> @sv3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    sdiv x8, x8, x9
 ; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    fmov x11, d3
+; CHECK-GI-NEXT:    fmov x12, d3
 ; CHECK-GI-NEXT:    mov x14, v3.d[1]
 ; CHECK-GI-NEXT:    sdiv x9, x9, x10
 ; CHECK-GI-NEXT:    mov v6.d[0], x8
 ; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    fmov x10, d5
 ; CHECK-GI-NEXT:    mov v6.d[1], x9
-; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    sdiv x12, x8, x9
-; CHECK-GI-NEXT:    fmov x10, d6
+; CHECK-GI-NEXT:    sdiv x9, x8, x10
+; CHECK-GI-NEXT:    fmov x11, d6
 ; CHECK-GI-NEXT:    mov x13, v6.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
-; CHECK-GI-NEXT:    mul x11, x13, x14
-; CHECK-GI-NEXT:    mov v2.d[0], x10
-; CHECK-GI-NEXT:    mov v2.d[1], x11
-; CHECK-GI-NEXT:    msub x8, x12, x9, x8
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    mul x12, x13, x14
+; CHECK-GI-NEXT:    mov v2.d[0], x11
+; CHECK-GI-NEXT:    mov v2.d[1], x12
+; CHECK-GI-NEXT:    msub x8, x9, x10, x8
 ; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    fmov d2, x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <3 x i64> %d, %e
@@ -3634,25 +3665,26 @@ define <3 x i64> @uv3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    udiv x8, x8, x9
 ; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    fmov x11, d3
+; CHECK-GI-NEXT:    fmov x12, d3
 ; CHECK-GI-NEXT:    mov x14, v3.d[1]
 ; CHECK-GI-NEXT:    udiv x9, x9, x10
 ; CHECK-GI-NEXT:    mov v6.d[0], x8
 ; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    fmov x10, d5
 ; CHECK-GI-NEXT:    mov v6.d[1], x9
-; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    udiv x12, x8, x9
-; CHECK-GI-NEXT:    fmov x10, d6
+; CHECK-GI-NEXT:    udiv x9, x8, x10
+; CHECK-GI-NEXT:    fmov x11, d6
 ; CHECK-GI-NEXT:    mov x13, v6.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
-; CHECK-GI-NEXT:    mul x11, x13, x14
-; CHECK-GI-NEXT:    mov v2.d[0], x10
-; CHECK-GI-NEXT:    mov v2.d[1], x11
-; CHECK-GI-NEXT:    msub x8, x12, x9, x8
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    mul x12, x13, x14
+; CHECK-GI-NEXT:    mov v2.d[0], x11
+; CHECK-GI-NEXT:    mov v2.d[1], x12
+; CHECK-GI-NEXT:    msub x8, x9, x10, x8
 ; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    fmov d2, x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <3 x i64> %d, %e
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 853ed92c91fbcd..ca38f3b701084d 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -219,18 +219,16 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i8_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl w8, w0, #8
-; CHECK-GI-NEXT:    lsl w9, w1, #8
-; CHECK-GI-NEXT:    lsl w10, w2, #8
-; CHECK-GI-NEXT:    sxth w8, w8
-; CHECK-GI-NEXT:    sxth w9, w9
-; CHECK-GI-NEXT:    asr w8, w8, #8
-; CHECK-GI-NEXT:    asr w9, w9, #8
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    sxth w8, w10
-; CHECK-GI-NEXT:    asr w8, w8, #8
-; CHECK-GI-NEXT:    mov v0.h[1], w9
-; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    sxtb w8, w1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    sxtb w8, w2
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -252,11 +250,14 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
 ; CHECK-GI-LABEL: sext_v3i8_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sxtb w8, w0
-; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    sxtb w8, w1
-; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    sxtb w8, w2
-; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i8> %a to <3 x i32>
@@ -284,14 +285,17 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
 ; CHECK-GI-LABEL: sext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    sxtb x8, w0
 ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    sxtb x8, w0
-; CHECK-GI-NEXT:    sxtb x9, w1
-; CHECK-GI-NEXT:    sxtb x10, w2
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    sxtb x8, w1
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    sxtb x8, w2
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i8> %a to <3 x i64>
@@ -313,7 +317,9 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    smov w8, v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i16> %a to <3 x i32>
@@ -337,10 +343,13 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    smov x8, v0.h[0]
 ; CHECK-GI-NEXT:    smov x9, v0.h[1]
-; CHECK-GI-NEXT:    smov x10, v0.h[2]
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    mov v3.d[0], x8
+; CHECK-GI-NEXT:    smov x8, v0.h[2]
+; CHECK-GI-NEXT:    mov v3.d[1], x9
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i16> %a to <3 x i64>
@@ -362,10 +371,13 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    smov x8, v0.s[0]
 ; CHECK-GI-NEXT:    smov x9, v0.s[1]
-; CHECK-GI-NEXT:    smov x10, v0.s[2]
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    mov v3.d[0], x8
+; CHECK-GI-NEXT:    smov x8, v0.s[2]
+; CHECK-GI-NEXT:    mov v3.d[1], x9
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i32> %a to <3 x i64>
@@ -384,18 +396,16 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i10_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl w8, w0, #6
-; CHECK-GI-NEXT:    lsl w9, w1, #6
-; CHECK-GI-NEXT:    lsl w10, w2, #6
-; CHECK-GI-NEXT:    sxth w8, w8
-; CHECK-GI-NEXT:    sxth w9, w9
-; CHECK-GI-NEXT:    asr w8, w8, #6
-; CHECK-GI-NEXT:    asr w9, w9, #6
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    sxth w8, w10
-; CHECK-GI-NEXT:    asr w8, w8, #6
-; CHECK-GI-NEXT:    mov v0.h[1], w9
-; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    sbfx w8, w0, #0, #10
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    sbfx w8, w1, #0, #10
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    sbfx w8, w2, #0, #10
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -417,11 +427,14 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
 ; CHECK-GI-LABEL: sext_v3i10_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sbfx w8, w0, #0, #10
-; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    sbfx w8, w1, #0, #10
-; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    sbfx w8, w2, #0, #10
-; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i10> %a to <3 x i32>
@@ -449,14 +462,17 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
 ; CHECK-GI-LABEL: sext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    sbfx x8, x0, #0, #10
 ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    sbfx x8, x0, #0, #10
-; CHECK-GI-NEXT:    sbfx x9, x1, #0, #10
-; CHECK-GI-NEXT:    sbfx x10, x2, #0, #10
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    sbfx x8, x1, #0, #10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    sbfx x8, x2, #0, #10
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i10> %a to <3 x i64>
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 066928687cc02d..c8344a39da56a7 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -1069,46 +1069,188 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    mov v0.b[2], w2
 ; CHECK-GI-NEXT:    mov v1.b[2], w5
 ; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT:    umov w0, v0.b[0]
-; CHECK-GI-NEXT:    umov w1, v0.b[1]
-; CHECK-GI-NEXT:    umov w2, v0.b[2]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[1]
+; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v0.b[2]
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov s0, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v1.s[2]
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    fmov w1, s0
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
     %3 = shl <3 x i8> %0, %1
     ret <3 x i8> %3
 }
 
 define <7 x i8> @shl_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-LABEL: shl_v7i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_v7i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_v7i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov b3, v0.b[1]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov b6, v0.b[2]
+; CHECK-GI-NEXT:    mov b7, v1.b[2]
+; CHECK-GI-NEXT:    mov v2.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[3]
+; CHECK-GI-NEXT:    mov v2.b[2], v6.b[0]
+; CHECK-GI-NEXT:    mov b6, v0.b[4]
+; CHECK-GI-NEXT:    mov v5.b[2], v7.b[0]
+; CHECK-GI-NEXT:    mov b7, v1.b[4]
+; CHECK-GI-NEXT:    mov v2.b[3], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[5]
+; CHECK-GI-NEXT:    mov b0, v0.b[6]
+; CHECK-GI-NEXT:    mov v5.b[3], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[5]
+; CHECK-GI-NEXT:    mov b1, v1.b[6]
+; CHECK-GI-NEXT:    mov v2.b[4], v6.b[0]
+; CHECK-GI-NEXT:    mov v5.b[4], v7.b[0]
+; CHECK-GI-NEXT:    mov v2.b[5], v3.b[0]
+; CHECK-GI-NEXT:    mov v5.b[5], v4.b[0]
+; CHECK-GI-NEXT:    mov v2.b[6], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[6], v1.b[0]
+; CHECK-GI-NEXT:    ushl v0.8b, v2.8b, v5.8b
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov b5, v0.b[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov b1, v0.b[5]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.h[4], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.h[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.h[6], w8
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[3]
+; CHECK-GI-NEXT:    mov h3, v0.h[4]
+; CHECK-GI-NEXT:    mov h4, v0.h[5]
+; CHECK-GI-NEXT:    mov h5, v0.h[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov h1, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.b[3], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.b[4], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.b[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.b[6], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = shl <7 x i8> %0, %1
     ret <7 x i8> %3
 }
 
 define <3 x i16> @shl_v3i16(<3 x i16> %0, <3 x i16> %1){
-; CHECK-LABEL: shl_v3i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_v3i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_v3i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT:    ushl v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = shl <3 x i16> %0, %1
     ret <3 x i16> %3
 }
 
 define <7 x i16> @shl_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-LABEL: shl_v7i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_v7i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_v7i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-GI-NEXT:    ushl v1.8h, v2.8h, v3.8h
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    ret
     %3 = shl <7 x i16> %0, %1
     ret <7 x i16> %3
 }
 
 define <3 x i32> @shl_v3i32(<3 x i32> %0, <3 x i32> %1){
-; CHECK-LABEL: shl_v3i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ushl v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
     %3 = shl <3 x i32> %0, %1
     ret <3 x i32> %3
 }
@@ -1142,50 +1284,196 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    mov v1.b[2], w2
 ; CHECK-GI-NEXT:    neg v0.8b, v0.8b
 ; CHECK-GI-NEXT:    sshl v0.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT:    umov w0, v0.b[0]
-; CHECK-GI-NEXT:    umov w1, v0.b[1]
-; CHECK-GI-NEXT:    umov w2, v0.b[2]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[1]
+; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v0.b[2]
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov s0, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v1.s[2]
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    fmov w1, s0
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
     %3 = ashr <3 x i8> %0, %1
     ret <3 x i8> %3
 }
 
 define <7 x i8> @ashr_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-LABEL: ashr_v7i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg v1.8b, v1.8b
-; CHECK-NEXT:    sshl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ashr_v7i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v1.8b, v1.8b
+; CHECK-SD-NEXT:    sshl v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ashr_v7i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov b2, v1.b[1]
+; CHECK-GI-NEXT:    mov v3.b[0], v1.b[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov b4, v0.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b6, v1.b[2]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v6.b[0]
+; CHECK-GI-NEXT:    mov b6, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v1.b[4]
+; CHECK-GI-NEXT:    mov v3.b[3], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov v5.b[3], v6.b[0]
+; CHECK-GI-NEXT:    mov b6, v1.b[5]
+; CHECK-GI-NEXT:    mov b1, v1.b[6]
+; CHECK-GI-NEXT:    mov v3.b[4], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[5]
+; CHECK-GI-NEXT:    mov b0, v0.b[6]
+; CHECK-GI-NEXT:    mov v5.b[4], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[5], v6.b[0]
+; CHECK-GI-NEXT:    mov v5.b[5], v2.b[0]
+; CHECK-GI-NEXT:    mov v3.b[6], v1.b[0]
+; CHECK-GI-NEXT:    mov v5.b[6], v0.b[0]
+; CHECK-GI-NEXT:    neg v0.8b, v3.8b
+; CHECK-GI-NEXT:    sshl v0.8b, v5.8b, v0.8b
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov b5, v0.b[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov b1, v0.b[5]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.h[4], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.h[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.h[6], w8
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[3]
+; CHECK-GI-NEXT:    mov h3, v0.h[4]
+; CHECK-GI-NEXT:    mov h4, v0.h[5]
+; CHECK-GI-NEXT:    mov h5, v0.h[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov h1, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.b[3], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.b[4], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.b[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.b[6], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = ashr <7 x i8> %0, %1
     ret <7 x i8> %3
 }
 
 define <3 x i16> @ashr_v3i16(<3 x i16> %0, <3 x i16> %1){
-; CHECK-LABEL: ashr_v3i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg v1.4h, v1.4h
-; CHECK-NEXT:    sshl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ashr_v3i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v1.4h, v1.4h
+; CHECK-SD-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ashr_v3i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT:    neg v0.4h, v2.4h
+; CHECK-GI-NEXT:    sshl v1.4h, v3.4h, v0.4h
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = ashr <3 x i16> %0, %1
     ret <3 x i16> %3
 }
 
 define <7 x i16> @ashr_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-LABEL: ashr_v7i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg v1.8h, v1.8h
-; CHECK-NEXT:    sshl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ashr_v7i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v1.8h, v1.8h
+; CHECK-SD-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ashr_v7i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v2.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v3.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v2.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v3.h[4], v0.h[4]
+; CHECK-GI-NEXT:    mov v2.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v3.h[5], v0.h[5]
+; CHECK-GI-NEXT:    mov v2.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v3.h[6], v0.h[6]
+; CHECK-GI-NEXT:    neg v0.8h, v2.8h
+; CHECK-GI-NEXT:    sshl v1.8h, v3.8h, v0.8h
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    ret
     %3 = ashr <7 x i16> %0, %1
     ret <7 x i16> %3
 }
 
 define <3 x i32> @ashr_v3i32(<3 x i32> %0, <3 x i32> %1){
-; CHECK-LABEL: ashr_v3i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg v1.4s, v1.4s
-; CHECK-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ashr_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v1.4s, v1.4s
+; CHECK-SD-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ashr_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v2.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT:    neg v0.4s, v2.4s
+; CHECK-GI-NEXT:    sshl v1.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
     %3 = ashr <3 x i32> %0, %1
     ret <3 x i32> %3
 }
@@ -1218,50 +1506,196 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    mov v1.b[2], w2
 ; CHECK-GI-NEXT:    neg v0.8b, v0.8b
 ; CHECK-GI-NEXT:    ushl v0.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT:    umov w0, v0.b[0]
-; CHECK-GI-NEXT:    umov w1, v0.b[1]
-; CHECK-GI-NEXT:    umov w2, v0.b[2]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[1]
+; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v0.b[2]
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov s0, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v1.s[2]
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    fmov w1, s0
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
     %3 = lshr <3 x i8> %0, %1
     ret <3 x i8> %3
 }
 
 define <7 x i8> @lshr_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-LABEL: lshr_v7i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg v1.8b, v1.8b
-; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: lshr_v7i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v1.8b, v1.8b
+; CHECK-SD-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: lshr_v7i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov b2, v1.b[1]
+; CHECK-GI-NEXT:    mov v3.b[0], v1.b[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov b4, v0.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b6, v1.b[2]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v6.b[0]
+; CHECK-GI-NEXT:    mov b6, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v1.b[4]
+; CHECK-GI-NEXT:    mov v3.b[3], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov v5.b[3], v6.b[0]
+; CHECK-GI-NEXT:    mov b6, v1.b[5]
+; CHECK-GI-NEXT:    mov b1, v1.b[6]
+; CHECK-GI-NEXT:    mov v3.b[4], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[5]
+; CHECK-GI-NEXT:    mov b0, v0.b[6]
+; CHECK-GI-NEXT:    mov v5.b[4], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[5], v6.b[0]
+; CHECK-GI-NEXT:    mov v5.b[5], v2.b[0]
+; CHECK-GI-NEXT:    mov v3.b[6], v1.b[0]
+; CHECK-GI-NEXT:    mov v5.b[6], v0.b[0]
+; CHECK-GI-NEXT:    neg v0.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v0.8b, v5.8b, v0.8b
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov b5, v0.b[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov b1, v0.b[5]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.h[4], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.h[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.h[6], w8
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[3]
+; CHECK-GI-NEXT:    mov h3, v0.h[4]
+; CHECK-GI-NEXT:    mov h4, v0.h[5]
+; CHECK-GI-NEXT:    mov h5, v0.h[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov h1, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.b[3], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.b[4], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.b[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.b[6], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = lshr <7 x i8> %0, %1
     ret <7 x i8> %3
 }
 
 define <3 x i16> @lshr_v3i16(<3 x i16> %0, <3 x i16> %1){
-; CHECK-LABEL: lshr_v3i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg v1.4h, v1.4h
-; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: lshr_v3i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v1.4h, v1.4h
+; CHECK-SD-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: lshr_v3i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT:    neg v0.4h, v2.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v3.4h, v0.4h
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v1.h[2]
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %3 = lshr <3 x i16> %0, %1
     ret <3 x i16> %3
 }
 
 define <7 x i16> @lshr_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-LABEL: lshr_v7i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg v1.8h, v1.8h
-; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: lshr_v7i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v1.8h, v1.8h
+; CHECK-SD-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: lshr_v7i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v2.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v3.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v2.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v3.h[4], v0.h[4]
+; CHECK-GI-NEXT:    mov v2.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v3.h[5], v0.h[5]
+; CHECK-GI-NEXT:    mov v2.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v3.h[6], v0.h[6]
+; CHECK-GI-NEXT:    neg v0.8h, v2.8h
+; CHECK-GI-NEXT:    ushl v1.8h, v3.8h, v0.8h
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    ret
     %3 = lshr <7 x i16> %0, %1
     ret <7 x i16> %3
 }
 
 define <3 x i32> @lshr_v3i32(<3 x i32> %0, <3 x i32> %1){
-; CHECK-LABEL: lshr_v3i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg v1.4s, v1.4s
-; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: lshr_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v1.4s, v1.4s
+; CHECK-SD-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: lshr_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v2.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v1.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT:    neg v0.4s, v2.4s
+; CHECK-GI-NEXT:    ushl v1.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
     %3 = lshr <3 x i32> %0, %1
     ret <3 x i32> %3
 }
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 6b5951551c3a54..db0fd4293e084b 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -322,10 +322,17 @@ define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b){
 }
 
 define <1 x i32> @shufflevector_v1i32(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: shufflevector_v1i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, d1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v1i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d0, d1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v1i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 1>
     ret <1 x i32> %c
 }
@@ -464,9 +471,16 @@ define <16 x i16> @shufflevector_v16i16_zeroes(<16 x i16> %a, <16 x i16> %b){
 }
 
 define <1 x i32> @shufflevector_v1i32_zeroes(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: shufflevector_v1i32_zeroes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v1i32_zeroes:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v1i32_zeroes:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v0.s[0], v0.s[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 0>
     ret <1 x i32> %c
 }
@@ -503,19 +517,14 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufflevector_v3i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w3
-; CHECK-GI-NEXT:    adrp x8, .LCPI30_0
-; CHECK-GI-NEXT:    mov v0.b[1], w1
-; CHECK-GI-NEXT:    mov v1.b[1], w4
-; CHECK-GI-NEXT:    mov v0.b[2], w2
-; CHECK-GI-NEXT:    mov v1.b[2], w5
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI30_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
-; CHECK-GI-NEXT:    umov w0, v0.b[0]
-; CHECK-GI-NEXT:    umov w1, v0.b[1]
-; CHECK-GI-NEXT:    umov w2, v0.b[2]
+; CHECK-GI-NEXT:    mov v0.s[0], w0
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    mov v0.s[2], w2
+; CHECK-GI-NEXT:    mov w2, w4
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    fmov w1, s0
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 1, i32 2, i32 4>
     ret <3 x i8> %c
@@ -535,12 +544,62 @@ define <7 x i8> @shufflevector_v7i8(<7 x i8> %a, <7 x i8> %b) {
 ; CHECK-GI-LABEL: shufflevector_v7i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI31_0
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI31_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    mov b4, v0.b[3]
+; CHECK-GI-NEXT:    mov b5, v0.b[4]
+; CHECK-GI-NEXT:    mov b6, v0.b[5]
+; CHECK-GI-NEXT:    mov b7, v1.b[3]
+; CHECK-GI-NEXT:    mov b16, v1.b[4]
+; CHECK-GI-NEXT:    mov b17, v1.b[5]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov b2, v0.b[6]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov b3, v1.b[6]
+; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    mov v0.h[4], w8
+; CHECK-GI-NEXT:    fmov w8, s6
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    fmov w9, s7
+; CHECK-GI-NEXT:    mov v0.h[5], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v1.h[3], w9
+; CHECK-GI-NEXT:    mov v0.h[6], w8
+; CHECK-GI-NEXT:    fmov w8, s16
+; CHECK-GI-NEXT:    mov v1.h[4], w8
+; CHECK-GI-NEXT:    fmov w8, s17
+; CHECK-GI-NEXT:    mov h4, v0.h[3]
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    mov h0, v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[5], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    mov v2.b[1], w9
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov v1.h[6], w8
+; CHECK-GI-NEXT:    mov v2.b[2], w9
+; CHECK-GI-NEXT:    mov h0, v1.h[1]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov h3, v1.h[3]
+; CHECK-GI-NEXT:    mov v2.b[3], w8
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov h0, v1.h[5]
+; CHECK-GI-NEXT:    mov v2.b[4], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v2.b[5], w8
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov v2.b[6], w8
+; CHECK-GI-NEXT:    fmov d0, d2
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
     ret <7 x i8> %c
@@ -556,11 +615,18 @@ define <3 x i16> @shufflevector_v3i16(<3 x i16> %a, <3 x i16> %b) {
 ; CHECK-GI-LABEL: shufflevector_v3i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    umov w8, v0.h[0]
+; CHECK-GI-NEXT:    umov w9, v0.h[1]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI32_0
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI32_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    mov v2.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v0.h[2]
+; CHECK-GI-NEXT:    mov v2.s[1], w9
+; CHECK-GI-NEXT:    mov v2.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v2.s[1]
+; CHECK-GI-NEXT:    mov w9, v2.s[2]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 1, i32 2, i32 4>
@@ -579,11 +645,27 @@ define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
 ;
 ; CHECK-GI-LABEL: shufflevector_v7i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI33_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v2.h[4], v0.h[4]
+; CHECK-GI-NEXT:    mov v3.h[4], v1.h[4]
+; CHECK-GI-NEXT:    mov v2.h[5], v0.h[5]
+; CHECK-GI-NEXT:    mov v3.h[5], v1.h[5]
+; CHECK-GI-NEXT:    mov v2.h[6], v0.h[6]
+; CHECK-GI-NEXT:    mov v3.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.h[0], v2.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[3]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[5]
+; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v0.h[4], v3.h[1]
+; CHECK-GI-NEXT:    mov v0.h[5], v3.h[3]
+; CHECK-GI-NEXT:    mov v0.h[6], v3.h[5]
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
     ret <7 x i16> %c
@@ -598,11 +680,12 @@ define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) {
 ;
 ; CHECK-GI-LABEL: shufflevector_v3i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI34_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v0.s[0], v2.s[1]
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[2]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[1]
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 1, i32 2, i32 4>
     ret <3 x i32> %c
@@ -619,52 +702,130 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufflevector_v3i8_zeroes:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov v0.b[1], w1
-; CHECK-GI-NEXT:    mov v0.b[2], w2
-; CHECK-GI-NEXT:    dup v0.8b, v0.b[0]
-; CHECK-GI-NEXT:    umov w0, v0.b[0]
-; CHECK-GI-NEXT:    umov w1, v0.b[1]
-; CHECK-GI-NEXT:    umov w2, v0.b[2]
+; CHECK-GI-NEXT:    mov v0.s[0], w0
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    mov v0.s[2], w2
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s0
+; CHECK-GI-NEXT:    fmov w2, s0
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 0, i32 0, i32 0>
     ret <3 x i8> %c
 }
 
 define <7 x i8> @shufflevector_v7i8_zeroes(<7 x i8> %a, <7 x i8> %b) {
-; CHECK-LABEL: shufflevector_v7i8_zeroes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v0.8b, v0.b[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v7i8_zeroes:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v7i8_zeroes:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov b5, v0.b[6]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov b1, v0.b[5]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.h[4], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.h[5], w8
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    mov v0.h[6], w8
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    fmov w10, s0
+; CHECK-GI-NEXT:    fmov w11, s0
+; CHECK-GI-NEXT:    fmov w12, s0
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    mov v0.b[2], w9
+; CHECK-GI-NEXT:    mov v0.b[3], w10
+; CHECK-GI-NEXT:    mov v0.b[4], w11
+; CHECK-GI-NEXT:    mov v0.b[5], w12
+; CHECK-GI-NEXT:    mov v0.b[6], w13
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
     ret <7 x i8> %c
 }
 
 define <3 x i16> @shufflevector_v3i16_zeroes(<3 x i16> %a, <3 x i16> %b) {
-; CHECK-LABEL: shufflevector_v3i16_zeroes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v0.4h, v0.h[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v3i16_zeroes:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v3i16_zeroes:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    umov w8, v0.h[0]
+; CHECK-GI-NEXT:    umov w9, v0.h[1]
+; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    umov w8, v0.h[2]
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 0, i32 0, i32 0>
     ret <3 x i16> %c
 }
 
 define <7 x i16> @shufflevector_v7i16_zeroes(<7 x i16> %a, <7 x i16> %b) {
-; CHECK-LABEL: shufflevector_v7i16_zeroes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v0.8h, v0.h[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v7i16_zeroes:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v7i16_zeroes:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v1.h[4], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[5], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[6], v0.h[6]
+; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
     ret <7 x i16> %c
 }
 
 define <3 x i32> @shufflevector_v3i32_zeroes(<3 x i32> %a, <3 x i32> %b) {
-; CHECK-LABEL: shufflevector_v3i32_zeroes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v0.4s, v0.s[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v3i32_zeroes:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v3i32_zeroes:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 0, i32 0, i32 0>
     ret <3 x i32> %c
 }
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 8f35a69f52b85b..8cd1bcfb82dcc3 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -343,10 +343,24 @@ entry:
 }
 
 define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT:    sub v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = sub <3 x i32> %d, %e
   ret <3 x i32> %s
@@ -408,8 +422,9 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-GI-NEXT:    sub x8, x8, x9
-; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index d71aed2d17506b..69fd0ad01b7c5b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -187,12 +187,22 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
 }
 
 define i32 @test_v3i32(<3 x i32> %a) nounwind {
-; CHECK-LABEL: test_v3i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.s[3], wzr
-; CHECK-NEXT:    umaxv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov v0.s[3], wzr
+; CHECK-SD-NEXT:    umaxv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[3], wzr
+; CHECK-GI-NEXT:    umaxv s0, v1.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %b = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a)
   ret i32 %b
 }
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 8a4d6b8c7b789f..96474a84ca9924 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -293,10 +293,19 @@ entry:
 }
 
 define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
-; CHECK-LABEL: xtn_v3i32_v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: xtn_v3i32_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v3i32_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = trunc <3 x i32> %a to <3 x i16>
   ret <3 x i16> %arg1
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 0d5010113ce0b2..2e979bb1225601 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -243,11 +243,15 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
 ; CHECK-GI-LABEL: zext_v3i8_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    and w8, w0, #0xff
-; CHECK-GI-NEXT:    and w9, w1, #0xff
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    and w8, w1, #0xff
+; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    and w8, w2, #0xff
-; CHECK-GI-NEXT:    mov v0.h[1], w9
-; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -269,11 +273,14 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
 ; CHECK-GI-LABEL: zext_v3i8_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    and w8, w0, #0xff
-; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    and w8, w1, #0xff
-; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    and w8, w2, #0xff
-; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i8> %a to <3 x i32>
@@ -301,14 +308,17 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
 ; CHECK-GI-LABEL: zext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    and x8, x0, #0xff
 ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    and x8, x0, #0xff
-; CHECK-GI-NEXT:    and x9, x1, #0xff
-; CHECK-GI-NEXT:    and x10, x2, #0xff
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    and x8, x1, #0xff
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    and x8, x2, #0xff
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i8> %a to <3 x i64>
@@ -330,7 +340,9 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    umov w8, v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i16> %a to <3 x i32>
@@ -354,10 +366,13 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    umov w8, v0.h[0]
 ; CHECK-GI-NEXT:    umov w9, v0.h[1]
-; CHECK-GI-NEXT:    umov w10, v0.h[2]
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    mov v3.d[0], x8
+; CHECK-GI-NEXT:    umov w8, v0.h[2]
+; CHECK-GI-NEXT:    mov v3.d[1], x9
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i16> %a to <3 x i64>
@@ -379,10 +394,13 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov w8, v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
-; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    mov v3.d[0], x8
+; CHECK-GI-NEXT:    mov w8, v0.s[2]
+; CHECK-GI-NEXT:    mov v3.d[1], x9
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fmov d0, d3
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i32> %a to <3 x i64>
@@ -402,11 +420,15 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
 ; CHECK-GI-LABEL: zext_v3i10_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    and w8, w0, #0x3ff
-; CHECK-GI-NEXT:    and w9, w1, #0x3ff
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    and w8, w1, #0x3ff
+; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    and w8, w2, #0x3ff
-; CHECK-GI-NEXT:    mov v0.h[1], w9
-; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -428,11 +450,14 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
 ; CHECK-GI-LABEL: zext_v3i10_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    and w8, w0, #0x3ff
-; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    and w8, w1, #0x3ff
-; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    and w8, w2, #0x3ff
-; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i10> %a to <3 x i32>
@@ -459,14 +484,17 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
 ; CHECK-GI-LABEL: zext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    and x8, x0, #0x3ff
 ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    and x8, x0, #0x3ff
-; CHECK-GI-NEXT:    and x9, x1, #0x3ff
-; CHECK-GI-NEXT:    and x10, x2, #0x3ff
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    and x8, x1, #0x3ff
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    and x8, x2, #0x3ff
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i10> %a to <3 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index c8b82716a9fe13..74f259d7cd4cca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -9,8 +9,13 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; GFX7-LABEL: v_add_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16:
@@ -45,8 +50,13 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_fneg_lhs:
@@ -84,8 +94,13 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_fneg_rhs:
@@ -130,6 +145,11 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
@@ -165,8 +185,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xffffffc0, v1
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
@@ -197,8 +222,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 4, v1
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
@@ -230,8 +260,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xffffffc0, v1
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
@@ -614,6 +649,11 @@ define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_inline_imm_neg1_0:
@@ -645,6 +685,11 @@ define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_inline_imm_1_0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..aba7ded8fe17f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -753,6 +753,11 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, v2, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v2i16:
@@ -782,10 +787,15 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_ashr_v2i16_15:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 15, v0
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 15, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index 132dc876b3b054..b026fdb755c00f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -566,6 +566,11 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_v2i16:
@@ -609,6 +614,10 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
 ; GFX8-LABEL: v_bswap_v3i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_mov_b32 s4, 0x2030001
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
index 42e53bedb8d857..26e8fe2c9a27c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
@@ -838,11 +838,18 @@ body:             |
     ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
     ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32)
+    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX9-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32)
+    ; GFX9-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX9-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX9-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+    ; GFX9-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
     ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ;
     ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
@@ -864,11 +871,18 @@ body:             |
     ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
     ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
-    ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
-    ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
+    ; GFX9-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32)
+    ; GFX9-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32)
+    ; GFX9-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32)
+    ; GFX9-CONTRACT-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX9-CONTRACT-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32)
+    ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+    ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
     ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ;
     ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
@@ -891,11 +905,18 @@ body:             |
     ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
     ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
-    ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
-    ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+    ; GFX9-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32)
+    ; GFX9-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32)
+    ; GFX9-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32)
+    ; GFX9-DENORM-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX9-DENORM-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32)
+    ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+    ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
     ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ;
     ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
@@ -917,11 +938,18 @@ body:             |
     ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
     ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
-    ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
-    ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
+    ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32)
+    ; GFX9-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32)
+    ; GFX9-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32)
+    ; GFX9-UNSAFE-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX9-UNSAFE-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32)
+    ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+    ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
     ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ;
     ; GFX10-LABEL: name: test_4xfloat_add_mul
@@ -944,11 +972,18 @@ body:             |
     ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
     ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
-    ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
-    ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX10-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32)
+    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX10-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32)
+    ; GFX10-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX10-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX10-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+    ; GFX10-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
     ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ;
     ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
@@ -970,11 +1005,18 @@ body:             |
     ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
     ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
-    ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
-    ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
+    ; GFX10-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX10-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32)
+    ; GFX10-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32)
+    ; GFX10-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32)
+    ; GFX10-CONTRACT-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX10-CONTRACT-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32)
+    ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+    ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
     ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ;
     ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
@@ -997,11 +1039,18 @@ body:             |
     ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
     ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
-    ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
-    ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+    ; GFX10-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX10-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32)
+    ; GFX10-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32)
+    ; GFX10-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32)
+    ; GFX10-DENORM-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX10-DENORM-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32)
+    ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+    ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
     ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ;
     ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
@@ -1023,11 +1072,18 @@ body:             |
     ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
     ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
-    ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
-    ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
+    ; GFX10-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX10-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32)
+    ; GFX10-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32)
+    ; GFX10-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32)
+    ; GFX10-UNSAFE-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX10-UNSAFE-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32)
+    ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+    ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
     ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
@@ -1077,10 +1133,15 @@ body:             |
     ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
     ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
-    ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32)
+    ; GFX9-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX9-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX9-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
     ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ;
     ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1099,10 +1160,15 @@ body:             |
     ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
     ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
-    ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
-    ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
+    ; GFX9-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32)
+    ; GFX9-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32)
+    ; GFX9-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32)
+    ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
     ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ;
     ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1122,10 +1188,15 @@ body:             |
     ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
     ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
-    ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
-    ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+    ; GFX9-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32)
+    ; GFX9-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32)
+    ; GFX9-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32)
+    ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
     ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ;
     ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1144,10 +1215,15 @@ body:             |
     ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
     ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
-    ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
-    ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
+    ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32)
+    ; GFX9-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32)
+    ; GFX9-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32)
+    ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
     ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ;
     ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1167,10 +1243,15 @@ body:             |
     ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
     ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
-    ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
-    ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX10-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32)
+    ; GFX10-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX10-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX10-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
     ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ;
     ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1189,10 +1270,15 @@ body:             |
     ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
     ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
-    ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
-    ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
+    ; GFX10-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX10-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32)
+    ; GFX10-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32)
+    ; GFX10-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32)
+    ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
     ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ;
     ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1212,10 +1298,15 @@ body:             |
     ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
     ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
-    ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
-    ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+    ; GFX10-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX10-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32)
+    ; GFX10-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32)
+    ; GFX10-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32)
+    ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
     ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ;
     ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1234,10 +1325,15 @@ body:             |
     ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
     ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
-    ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
-    ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
-    ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
+    ; GFX10-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX10-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32)
+    ; GFX10-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32)
+    ; GFX10-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32)
+    ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+    ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+    ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
     ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
index 2845a632a84b36..5777ecfce459fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -15,8 +15,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -45,8 +46,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -77,8 +79,9 @@ body: |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
@@ -114,8 +117,9 @@ body: |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
@@ -147,8 +151,9 @@ body: |
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1
     ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
@@ -179,8 +184,9 @@ body: |
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1
     ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
@@ -213,8 +219,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5
@@ -258,8 +265,9 @@ body: |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5
@@ -304,8 +312,9 @@ body: |
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
@@ -347,8 +356,9 @@ body: |
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
@@ -399,8 +409,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %el1
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FNEG]]
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
@@ -430,8 +441,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
-    ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+    ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
     ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[COPY1]], %el1
     ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
index 9eeb633f0a817c..e91251186a18d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
@@ -68,8 +68,20 @@ define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 {
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY]]
   ; CHECK-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY1]]
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ADD]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[ADD1]](s32)
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+  ; CHECK-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+  ; CHECK-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C]]
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; CHECK-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+  ; CHECK-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>)
+  ; CHECK-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
+  ; CHECK-NEXT:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>)
+  ; CHECK-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[LSHR]](s32)
+  ; CHECK-NEXT:   $vgpr1 = COPY [[LSHR1]](s32)
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   %add = add <2 x i16> %arg0, %arg0
   ret <2 x i16> %add
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 5ba036c386a402..d723ccccda6953 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -774,20 +774,23 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v6, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v5, v6, v2
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v5, v6, v2
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
@@ -826,6 +829,9 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_fdiv_v2f16:
@@ -1076,16 +1082,19 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_afn:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_afn:
@@ -1152,20 +1161,23 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v6, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v5, v6, v2
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v5, v6, v2
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
@@ -1204,6 +1216,9 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_fdiv_v2f16_ulp25:
@@ -1467,20 +1482,23 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16:
@@ -1519,6 +1537,9 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_rcp_v2f16:
@@ -1770,20 +1791,23 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
@@ -1822,6 +1846,9 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_neg_rcp_v2f16:
@@ -2067,6 +2094,7 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
@@ -2076,22 +2104,24 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v0, v2, v1
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v2, v3, v2, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v5, v5, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v5, v1
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_fabs:
@@ -2117,24 +2147,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v3, v2, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v5, v5, v4
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v4
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v0, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v1, v5, v4
-; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v4
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_rcp_v2f16_fabs:
@@ -2389,6 +2422,7 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
@@ -2398,22 +2432,24 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v0, v2, v1
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v2, v3, v2, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v5, v5, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v5, v1
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
@@ -2439,24 +2475,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v3, v2, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v5, v5, v4
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v4
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v0, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v1, v5, v4
-; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v4
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
@@ -2717,20 +2756,23 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
@@ -2769,6 +2811,9 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f16_arcp:
@@ -2812,15 +2857,18 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
 ; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
@@ -2877,20 +2925,23 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
@@ -2929,6 +2980,9 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_rcp_v2f16_ulp25:
@@ -3167,16 +3221,19 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
@@ -3243,20 +3300,23 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v6, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v5, v6, v2
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v5, v6, v2
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3295,6 +3355,9 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3347,16 +3410,19 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
@@ -5395,8 +5461,11 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX6-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v5, v3, v4
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rsq_v2f16:
@@ -5441,6 +5510,9 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_rsq_v2f16:
@@ -5709,8 +5781,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX6-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v5, v3, v4
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16:
@@ -5755,6 +5830,9 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-IEEE-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index 99e6c5d06a0e19..f3237a2612616f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -237,16 +237,19 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; GFX6-LABEL: v_fma_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f16:
@@ -291,16 +294,19 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f16_fneg_lhs:
@@ -347,16 +353,19 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f16_fneg_rhs:
@@ -398,16 +407,19 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
 ; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs:
@@ -511,22 +523,28 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v4, v8
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v9
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v9, v11
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX6-NEXT:    v_fma_f32 v1, v1, v5, v9
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v10
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v7
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v11
-; GFX6-NEXT:    v_fma_f32 v2, v2, v4, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v8, v10
+; GFX6-NEXT:    v_fma_f32 v1, v1, v4, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_fma_f32 v3, v3, v6, v7
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_fma_f32 v3, v3, v7, v9
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_fma_f32 v2, v2, v6, v8
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 543f8e413abd86..882eacafef1956 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -144,8 +144,12 @@ define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
 ; GFX8-LABEL: v_fmul_v3f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x80008000
+; GFX8-NEXT:    v_xor_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
@@ -174,8 +178,12 @@ define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
 ; GFX8-LABEL: v_fmul_v3f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x80008000
+; GFX8-NEXT:    v_xor_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 0577117e9d9e1d..228d30a040aadf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -376,31 +376,34 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xc2fc0000
+; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0xc2fc0000
 ; GFX6-NEXT:    v_mov_b32_e32 v5, 0x42800000
-; GFX6-NEXT:    v_log_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v3, vcc
-; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 1.0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[4:5]
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16:
@@ -506,21 +509,24 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, v2
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
-; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_exp_f32_e32 v2, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 1.0, v5, s[4:5]
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
@@ -620,9 +626,9 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -632,21 +638,24 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0xc2fc0000
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x42800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v3
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
-; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 1.0, v5, s[4:5]
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
@@ -748,11 +757,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX6-NEXT:    v_log_f32_e32 v3, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -762,21 +771,24 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0xc2fc0000
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x42800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, v3
 ; GFX6-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
-; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, v4, s[4:5]
 ; GFX6-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX6-NEXT:    v_exp_f32_e32 v1, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v5, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 1.0, v5, s[4:5]
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 3bd3486ec261d4..3dc014a3588dd2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3983,6 +3983,11 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i16:
@@ -4063,6 +4068,11 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i16_4_8:
@@ -5037,7 +5047,17 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_bfe_u32 v4, v7, 1, 15
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 58304d2072d7f6..b12ad74462e7ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3763,6 +3763,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i16:
@@ -3852,6 +3857,11 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i16_4_8:
@@ -4341,6 +4351,10 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ;
 ; GFX8-LABEL: s_fshr_v3i16:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX8-NEXT:    s_or_b32 s4, s4, s8
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
@@ -4373,6 +4387,7 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    s_lshr_b32 s4, s4, 15
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
@@ -4593,6 +4608,9 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX8-LABEL: v_fshr_v3i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 15, v2
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
@@ -4623,7 +4641,7 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 15, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 1, v3
-; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v5
+; GFX8-NEXT:    v_xor_b32_sdwa v3, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v4, 15, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
@@ -5013,36 +5031,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v8
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX6-NEXT:    v_bfe_u32 v4, v6, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 14, v4
-; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_bfe_u32 v4, v7, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 14, v4
-; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v6
-; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v9
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v7
-; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT:    v_and_b32_e32 v8, 15, v6
-; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
-; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX6-NEXT:    v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v8, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
-; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, 15, v7
-; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v7
-; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
-; GFX6-NEXT:    v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
-; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v6, 1, 15
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 14, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v3
+; GFX6-NEXT:    v_bfe_u32 v3, v7, 1, 15
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 14, v3
+; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v9
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX6-NEXT:    v_and_b32_e32 v7, 15, v5
+; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX6-NEXT:    v_bfe_u32 v3, v3, 1, 15
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v7, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    v_and_b32_e32 v3, 15, v6
+; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v6
+; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_bfe_u32 v3, v4, 1, 15
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
index 8e4e4cf2c5b87f..cd02df5882ca1b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -27,10 +27,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_1d
@@ -55,10 +59,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_1d
@@ -83,10 +91,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -120,10 +132,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_2d
@@ -151,10 +167,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_2d
@@ -182,10 +202,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -227,10 +251,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_3d
@@ -265,10 +293,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_3d
@@ -302,10 +334,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -348,10 +384,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_cube
@@ -386,10 +426,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_cube
@@ -423,10 +467,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -462,10 +510,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_1darray
@@ -493,10 +545,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_1darray
@@ -524,10 +580,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -569,10 +629,14 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_2darray
@@ -607,10 +671,14 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_2darray
@@ -644,10 +712,14 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -690,10 +762,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_2dmsaa
@@ -728,10 +804,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_2dmsaa
@@ -765,10 +845,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -813,10 +897,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_2darraymsaa
@@ -853,10 +941,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_2darraymsaa
@@ -892,10 +984,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -932,10 +1028,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_mip_1d
@@ -963,10 +1063,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_mip_1d
@@ -994,10 +1098,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1039,10 +1147,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_mip_2d
@@ -1077,10 +1189,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_mip_2d
@@ -1114,10 +1230,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1162,10 +1282,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_mip_3d
@@ -1202,10 +1326,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_mip_3d
@@ -1241,10 +1369,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1290,10 +1422,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_mip_cube
@@ -1330,10 +1466,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_mip_cube
@@ -1369,10 +1509,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1416,10 +1560,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_mip_1darray
@@ -1454,10 +1602,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_mip_1darray
@@ -1491,10 +1643,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1539,10 +1695,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_mip_2darray
@@ -1579,10 +1739,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_mip_2darray
@@ -1618,10 +1782,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -3283,10 +3451,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_1d
@@ -3311,10 +3483,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_1d
@@ -3339,10 +3515,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -3373,10 +3553,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_2d
@@ -3401,10 +3585,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_2d
@@ -3429,10 +3617,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -3463,10 +3655,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_3d
@@ -3491,10 +3687,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_3d
@@ -3519,10 +3719,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -3553,10 +3757,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_cube
@@ -3581,10 +3789,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_cube
@@ -3609,10 +3821,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -3643,10 +3859,14 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_1darray
@@ -3671,10 +3891,14 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_1darray
@@ -3699,10 +3923,14 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -3733,10 +3961,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_2darray
@@ -3761,10 +3993,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_2darray
@@ -3789,10 +4025,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -3823,10 +4063,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_2dmsaa
@@ -3851,10 +4095,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_2dmsaa
@@ -3879,10 +4127,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -3913,10 +4165,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_2darraymsaa
@@ -3941,10 +4197,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_2darraymsaa
@@ -3969,10 +4229,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -4081,8 +4345,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   ;
   ; GFX10NSA-LABEL: name: load_1d_V2
@@ -4107,8 +4373,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   ;
   ; GFX12-LABEL: name: load_1d_V2
@@ -4133,8 +4401,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -4327,10 +4597,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_1d_glc
@@ -4355,10 +4629,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_1d_glc
@@ -4383,10 +4661,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -4417,10 +4699,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_1d_slc
@@ -4445,10 +4731,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_1d_slc
@@ -4473,10 +4763,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -4507,10 +4801,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_1d_glc_slc
@@ -4535,10 +4833,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_1d_glc_slc
@@ -4563,10 +4865,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -4851,10 +5157,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY2]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY3]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: getresinfo_dmask0
@@ -4863,10 +5173,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
   ; GFX10NSA-NEXT: {{  $}}
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY2]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY3]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: getresinfo_dmask0
@@ -4875,10 +5189,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT:   [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY2]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY3]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -4911,10 +5229,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX9-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_1d_tfe
@@ -4941,10 +5263,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_1d_tfe
@@ -4971,10 +5297,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX12-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -5013,10 +5343,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX9-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_2d_tfe
@@ -5046,10 +5380,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_2d_tfe
@@ -5079,10 +5417,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX12-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY9]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY12]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -5129,10 +5471,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX9-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_3d_tfe
@@ -5169,10 +5515,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_3d_tfe
@@ -5208,10 +5558,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX12-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -5261,10 +5615,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX9-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_2darraymsaa_tfe
@@ -5303,10 +5661,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: load_2darraymsaa_tfe
@@ -5344,10 +5706,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX12-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
index f61f985cd24ab1..294172336aef03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
@@ -119,6 +119,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ; UNPACKED-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s16>), align 8, addrspace 8)
   ; UNPACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<3 x s32>)
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
   ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
   ; UNPACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
   ; UNPACKED-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
@@ -126,7 +127,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ; UNPACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
   ; UNPACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-  ; UNPACKED-NEXT:   [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C]]
+  ; UNPACKED-NEXT:   [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
   ; UNPACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; UNPACKED-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32)
   ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
@@ -363,6 +364,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s,
   ; UNPACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<3 x s16>), align 8, addrspace 8)
   ; UNPACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<4 x s32>)
   ; UNPACKED-NEXT:   G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
   ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
   ; UNPACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
   ; UNPACKED-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
@@ -370,7 +372,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s,
   ; UNPACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
   ; UNPACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-  ; UNPACKED-NEXT:   [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C]]
+  ; UNPACKED-NEXT:   [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
   ; UNPACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; UNPACKED-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32)
   ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
@@ -598,6 +600,8 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc,
   ; UNPACKED-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8)
   ; UNPACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
+  ; UNPACKED-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
   ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
   ; UNPACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
   ; UNPACKED-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
@@ -605,9 +609,10 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc,
   ; UNPACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
   ; UNPACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+  ; UNPACKED-NEXT:   [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
   ; UNPACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; UNPACKED-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32)
-  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
+  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
   ; UNPACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[BITCAST]](<2 x s16>)
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -630,10 +635,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc,
   ; PACKED-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; PACKED-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8)
-  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
-  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+  ; PACKED-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; PACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+  ; PACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; PACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; PACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
   ; PACKED-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s16>)
   ; PACKED-NEXT:   $vgpr1 = COPY [[BITCAST]](<2 x s16>)
@@ -660,6 +669,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; UNPACKED-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s16), addrspace 8)
+  ; UNPACKED-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
   ; UNPACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]], [[C]]
   ; UNPACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -667,7 +679,8 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; UNPACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
   ; UNPACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]]
+  ; UNPACKED-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]]
+  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[BITCAST]](<2 x s16>)
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -690,10 +703,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; PACKED-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; PACKED-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s16), addrspace 8)
-  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
-  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+  ; PACKED-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; PACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+  ; PACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; PACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; PACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
   ; PACKED-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s16>)
   ; PACKED-NEXT:   $vgpr1 = COPY [[BITCAST]](<2 x s16>)
@@ -1145,6 +1162,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs
   ; UNPACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8)
   ; UNPACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<3 x s32>)
   ; UNPACKED-NEXT:   G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+  ; UNPACKED-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
   ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
   ; UNPACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
   ; UNPACKED-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
@@ -1152,9 +1171,10 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs
   ; UNPACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
   ; UNPACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+  ; UNPACKED-NEXT:   [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
   ; UNPACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; UNPACKED-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32)
-  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
+  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
   ; UNPACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[BITCAST]](<2 x s16>)
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1181,10 +1201,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs
   ; PACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
   ; PACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
   ; PACKED-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
-  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+  ; PACKED-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; PACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+  ; PACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; PACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; PACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
   ; PACKED-NEXT:   $vgpr0 = COPY [[BITCAST]](<2 x s16>)
   ; PACKED-NEXT:   $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1217,6 +1241,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs
   ; UNPACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16), addrspace 8)
   ; UNPACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
   ; UNPACKED-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+  ; UNPACKED-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
   ; UNPACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
   ; UNPACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -1224,7 +1251,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs
   ; UNPACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
   ; UNPACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]]
+  ; UNPACKED-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]]
+  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[BITCAST]](<2 x s16>)
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1251,10 +1279,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs
   ; PACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
   ; PACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
   ; PACKED-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
-  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+  ; PACKED-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; PACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+  ; PACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; PACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; PACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
   ; PACKED-NEXT:   $vgpr0 = COPY [[BITCAST]](<2 x s16>)
   ; PACKED-NEXT:   $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1287,6 +1319,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs
   ; UNPACKED-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16), addrspace 8)
   ; UNPACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
   ; UNPACKED-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+  ; UNPACKED-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
   ; UNPACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
   ; UNPACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -1294,7 +1329,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs
   ; UNPACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
   ; UNPACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]]
+  ; UNPACKED-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]]
+  ; UNPACKED-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
   ; UNPACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[BITCAST]](<2 x s16>)
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1321,10 +1357,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs
   ; PACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
   ; PACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
   ; PACKED-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
-  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+  ; PACKED-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; PACKED-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; PACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+  ; PACKED-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+  ; PACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; PACKED-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; PACKED-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+  ; PACKED-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
   ; PACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
   ; PACKED-NEXT:   $vgpr0 = COPY [[BITCAST]](<2 x s16>)
   ; PACKED-NEXT:   $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
index adf7e6d38b989b..52030a90ef66e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
@@ -44,8 +44,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GCN-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x float> %tex
@@ -70,9 +72,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GCN-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <3 x float> %tex
@@ -97,10 +102,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GCN-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %tex
@@ -157,8 +166,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32(<8 x i32> inreg %rsrc, i32 %s
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
   ; GCN-NEXT:   G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <2 x float>, i32 } %res, 0
@@ -188,9 +199,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32(<8 x i32> inreg %rsrc, i32 %s
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GCN-NEXT:   G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <3 x float>, i32 } %res, 0
@@ -220,10 +234,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32(<8 x i32> inreg %rsrc, i32 %s
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GCN-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <4 x float>, i32 } %res, 0
@@ -265,8 +283,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_1000(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8)
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; GCN-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x float> %tex
@@ -281,8 +301,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_0000(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY2]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY3]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x float> %tex
@@ -308,9 +330,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1100(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <3 x float> %tex
@@ -335,9 +360,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1000(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8)
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; GCN-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[DEF]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <3 x float> %tex
@@ -352,9 +380,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_0000(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<3 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY2]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY3]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY4]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <3 x float> %tex
@@ -380,10 +411,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1110(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %tex
@@ -409,10 +444,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1100(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %tex
@@ -437,10 +476,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1000(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8)
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; GCN-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[DEF]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %tex
@@ -455,10 +498,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_0000(<8 x i32> inreg %rsrc,
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY2]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY3]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY4]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY5]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %tex
@@ -516,8 +563,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_1000(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GCN-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV2]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV3]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <2 x float>, i32 } %res, 0
@@ -548,8 +597,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GCN-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV2]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV3]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <2 x float>, i32 } %res, 0
@@ -580,9 +631,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1100(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
   ; GCN-NEXT:   G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV3]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV4]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV5]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV4]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <3 x float>, i32 } %res, 0
@@ -613,9 +667,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1000(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GCN-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GCN-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[DEF1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <3 x float>, i32 } %res, 0
@@ -646,9 +703,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GCN-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GCN-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[DEF1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <3 x float>, i32 } %res, 0
@@ -679,10 +739,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1110(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GCN-NEXT:   G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV4]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV5]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[UV6]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[UV7]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV4]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV6]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV7]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <4 x float>, i32 } %res, 0
@@ -713,10 +777,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1100(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
   ; GCN-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GCN-NEXT:   G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF1]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <4 x float>, i32 } %res, 0
@@ -747,10 +815,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1000(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GCN-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GCN-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[DEF1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF1]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <4 x float>, i32 } %res, 0
@@ -781,10 +853,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %r
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GCN-NEXT:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GCN-NEXT:   G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GCN-NEXT:   $vgpr1 = COPY [[DEF1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[DEF1]](s32)
-  ; GCN-NEXT:   $vgpr3 = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY10]](s32)
+  ; GCN-NEXT:   $vgpr1 = COPY [[COPY11]](s32)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY12]](s32)
+  ; GCN-NEXT:   $vgpr3 = COPY [[COPY13]](s32)
   ; GCN-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
   %tex = extractvalue { <4 x float>, i32 } %res, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
index 4d36e0f7970167..3d90783b5cf69f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
@@ -25,10 +25,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GFX6-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX6-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX6-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX6-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX6-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX6-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX6-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX6-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX6-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX6-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX6-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX6-NEXT:   $vgpr0 = COPY [[COPY12]](s32)
+  ; GFX6-NEXT:   $vgpr1 = COPY [[COPY13]](s32)
+  ; GFX6-NEXT:   $vgpr2 = COPY [[COPY14]](s32)
+  ; GFX6-NEXT:   $vgpr3 = COPY [[COPY15]](s32)
   ; GFX6-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_2darraymsaa
@@ -50,10 +54,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GFX10NSA-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY12]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY13]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY14]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY15]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
@@ -84,10 +92,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
   ; GFX6-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX6-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX6-NEXT:   G_STORE [[UV4]](s32), [[MV]](p1) :: (store (s32) into %ir.out, addrspace 1)
-  ; GFX6-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX6-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX6-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX6-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX6-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX6-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX6-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX6-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX6-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX6-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX6-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX6-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX6-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10NSA-LABEL: name: load_2darraymsaa_tfe
@@ -113,10 +125,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
   ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA-NEXT:   G_STORE [[UV4]](s32), [[MV]](p1) :: (store (s32) into %ir.out, addrspace 1)
-  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10NSA-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10NSA-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10NSA-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10NSA-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10NSA-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10NSA-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10NSA-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
   %v.vec = extractvalue { <4 x float>, i32 } %v, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
index 5b017ad89a0ed3..f0585516446840 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
@@ -29,10 +29,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX9-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY13]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY16]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_1d
@@ -59,10 +63,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY13]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY16]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_1d
@@ -89,10 +97,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY13]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY16]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_1d
@@ -119,10 +131,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY13]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY16]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -155,10 +171,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX9-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_2d
@@ -186,10 +206,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_2d
@@ -217,10 +241,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_2d
@@ -248,10 +276,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -289,10 +321,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_3d
@@ -325,10 +361,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_3d
@@ -361,10 +401,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_3d
@@ -396,10 +440,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -437,10 +485,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_cube
@@ -473,10 +525,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cube
@@ -509,10 +565,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cube
@@ -544,10 +604,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -580,10 +644,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_1darray
@@ -611,10 +679,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_1darray
@@ -642,10 +714,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_1darray
@@ -673,10 +749,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -714,10 +794,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_2darray
@@ -750,10 +834,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_2darray
@@ -786,10 +874,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_2darray
@@ -821,10 +913,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -859,10 +955,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_1d
@@ -892,10 +992,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_1d
@@ -925,10 +1029,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_1d
@@ -957,10 +1065,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -996,10 +1108,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_2d
@@ -1030,10 +1146,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_2d
@@ -1064,10 +1184,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_2d
@@ -1097,10 +1221,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1133,10 +1261,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_cl_1d
@@ -1164,10 +1296,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cl_1d
@@ -1195,10 +1331,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cl_1d
@@ -1226,10 +1366,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1267,10 +1411,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_cl_2d
@@ -1303,10 +1451,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cl_2d
@@ -1339,10 +1491,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cl_2d
@@ -1374,10 +1530,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1413,10 +1573,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_cl_1d
@@ -1447,10 +1611,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cl_1d
@@ -1481,10 +1649,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cl_1d
@@ -1514,10 +1686,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1557,10 +1733,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_cl_2d
@@ -1594,10 +1774,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cl_2d
@@ -1631,10 +1815,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cl_2d
@@ -1668,10 +1856,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1707,10 +1899,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_b_1d
@@ -1741,10 +1937,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_b_1d
@@ -1775,10 +1975,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_b_1d
@@ -1808,10 +2012,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1849,10 +2057,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_b_2d
@@ -1885,10 +2097,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_b_2d
@@ -1921,10 +2137,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_b_2d
@@ -1956,10 +2176,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1997,10 +2221,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_b_1d
@@ -2032,10 +2260,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_b_1d
@@ -2067,10 +2299,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_b_1d
@@ -2102,10 +2338,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2145,10 +2385,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_b_2d
@@ -2182,10 +2426,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_b_2d
@@ -2219,10 +2467,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_b_2d
@@ -2256,10 +2508,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2297,10 +2553,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_b_cl_1d
@@ -2333,10 +2593,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_b_cl_1d
@@ -2369,10 +2633,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_b_cl_1d
@@ -2404,10 +2672,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2448,10 +2720,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_b_cl_2d
@@ -2486,10 +2762,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_b_cl_2d
@@ -2524,10 +2804,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_b_cl_2d
@@ -2562,10 +2846,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2605,10 +2893,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_b_cl_1d
@@ -2642,10 +2934,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_b_cl_1d
@@ -2679,10 +2975,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_b_cl_1d
@@ -2716,10 +3016,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2762,10 +3066,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_b_cl_2d
@@ -2802,10 +3110,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_b_cl_2d
@@ -2842,10 +3154,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_b_cl_2d
@@ -2882,10 +3198,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2924,10 +3244,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_d_1d
@@ -2960,10 +3284,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_1d
@@ -2996,10 +3324,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_1d
@@ -3032,10 +3364,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3079,10 +3415,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_d_2d
@@ -3120,10 +3460,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_2d
@@ -3161,10 +3505,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_2d
@@ -3202,10 +3550,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3259,10 +3611,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_d_3d
@@ -3311,10 +3667,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_3d
@@ -3363,10 +3723,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_3d
@@ -3415,10 +3779,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3459,10 +3827,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_d_1d
@@ -3497,10 +3869,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_1d
@@ -3535,10 +3911,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_1d
@@ -3573,10 +3953,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3622,10 +4006,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_d_2d
@@ -3665,10 +4053,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_2d
@@ -3708,10 +4100,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_2d
@@ -3751,10 +4147,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3795,10 +4195,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_d_cl_1d
@@ -3833,10 +4237,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_cl_1d
@@ -3871,10 +4279,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_cl_1d
@@ -3909,10 +4321,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3960,10 +4376,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_d_cl_2d
@@ -4005,10 +4425,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_cl_2d
@@ -4050,10 +4474,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_cl_2d
@@ -4095,10 +4523,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4141,10 +4573,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_d_cl_1d
@@ -4181,10 +4617,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_cl_1d
@@ -4221,10 +4661,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_cl_1d
@@ -4261,10 +4705,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4314,10 +4762,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_d_cl_2d
@@ -4361,10 +4813,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_cl_2d
@@ -4408,10 +4864,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_cl_2d
@@ -4456,10 +4916,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4498,10 +4962,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_cd_1d
@@ -4534,10 +5002,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cd_1d
@@ -4570,10 +5042,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cd_1d
@@ -4606,10 +5082,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4653,10 +5133,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_cd_2d
@@ -4694,10 +5178,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cd_2d
@@ -4735,10 +5223,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cd_2d
@@ -4776,10 +5268,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4820,10 +5316,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_cd_1d
@@ -4858,10 +5358,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cd_1d
@@ -4896,10 +5400,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cd_1d
@@ -4934,10 +5442,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4983,10 +5495,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_cd_2d
@@ -5026,10 +5542,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cd_2d
@@ -5069,10 +5589,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cd_2d
@@ -5112,10 +5636,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5156,10 +5684,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_cd_cl_1d
@@ -5194,10 +5726,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cd_cl_1d
@@ -5232,10 +5768,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cd_cl_1d
@@ -5270,10 +5810,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5321,10 +5865,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_cd_cl_2d
@@ -5366,10 +5914,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cd_cl_2d
@@ -5411,10 +5963,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX11-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cd_cl_2d
@@ -5456,10 +6012,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX12-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5502,10 +6062,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_cd_cl_1d
@@ -5542,10 +6106,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cd_cl_1d
@@ -5582,10 +6150,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cd_cl_1d
@@ -5622,10 +6194,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5675,10 +6251,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_cd_cl_2d
@@ -5722,10 +6302,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cd_cl_2d
@@ -5769,10 +6353,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cd_cl_2d
@@ -5817,10 +6405,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5853,10 +6445,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_l_1d
@@ -5884,10 +6480,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_l_1d
@@ -5915,10 +6515,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_l_1d
@@ -5946,10 +6550,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5987,10 +6595,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_l_2d
@@ -6023,10 +6635,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_l_2d
@@ -6059,10 +6675,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_l_2d
@@ -6094,10 +6714,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6133,10 +6757,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_l_1d
@@ -6167,10 +6795,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_l_1d
@@ -6201,10 +6833,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_l_1d
@@ -6234,10 +6870,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6277,10 +6917,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_l_2d
@@ -6314,10 +6958,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_l_2d
@@ -6351,10 +6999,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_l_2d
@@ -6388,10 +7040,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6423,10 +7079,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY13]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY16]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_lz_1d
@@ -6453,10 +7113,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY13]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY16]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_lz_1d
@@ -6483,10 +7147,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY13]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY16]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_lz_1d
@@ -6513,10 +7181,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY13]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY16]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6549,10 +7221,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_lz_2d
@@ -6580,10 +7256,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_lz_2d
@@ -6611,10 +7291,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_lz_2d
@@ -6642,10 +7326,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6680,10 +7368,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_lz_1d
@@ -6713,10 +7405,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_lz_1d
@@ -6746,10 +7442,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_lz_1d
@@ -6778,10 +7478,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY14]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY17]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6817,10 +7521,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX10-LABEL: name: sample_c_lz_2d
@@ -6851,10 +7559,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_lz_2d
@@ -6885,10 +7597,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_lz_2d
@@ -6918,10 +7634,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -7162,8 +7882,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX9-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
   ; GFX9-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   ;
   ; GFX10-LABEL: name: sample_c_d_o_2darray_V2
@@ -7210,8 +7932,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   ;
   ; GFX11-LABEL: name: sample_c_d_o_2darray_V2
@@ -7258,8 +7982,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   ;
   ; GFX12-LABEL: name: sample_c_d_o_2darray_V2
@@ -7306,8 +8032,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
index 241170b94318a5..d7c1c7a6bef5ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
@@ -34,10 +34,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<9 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<9 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_3d
@@ -70,10 +74,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<5 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_3d
@@ -106,10 +114,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<6 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -148,10 +160,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<10 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[BUILD_VECTOR2]](<10 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY23]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY24]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY25]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_3d
@@ -185,10 +201,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<6 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY23]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY24]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY25]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_3d
@@ -222,10 +242,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<7 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY23]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY24]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY25]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.3d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -265,10 +289,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<11 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[BUILD_VECTOR2]](<11 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY23]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY24]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY25]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY26]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_cl_3d
@@ -303,10 +331,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<7 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY23]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY24]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY25]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY26]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_cl_3d
@@ -341,10 +373,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY23]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY24]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY25]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY26]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.3d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -385,10 +421,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<12 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[BUILD_VECTOR2]](<12 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY24]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY25]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY26]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY27]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_cl_o_3d
@@ -424,10 +464,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY24]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY25]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY26]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY27]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_cl_o_3d
@@ -463,10 +507,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
   ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<9 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<9 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY24]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY25]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY26]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY27]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.3d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
index f05b258c974d1d..477965ab8981b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
@@ -34,10 +34,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_1d_g16_a16
@@ -70,10 +74,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_1d_g16_a16
@@ -106,10 +114,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -152,10 +164,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX10-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_2d_g16_a16
@@ -193,10 +209,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX11-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_2d_g16_a16
@@ -234,10 +254,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX12-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -291,10 +315,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_3d_g16_a16
@@ -343,10 +371,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_3d_g16_a16
@@ -395,10 +427,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
index cc2a8ba9c4d5d9..e78a9897be9c5a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
@@ -33,10 +33,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_1d
@@ -68,10 +72,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_1d
@@ -103,10 +111,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -148,10 +160,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_2d
@@ -188,10 +204,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_2d
@@ -228,10 +248,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -283,10 +307,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_3d
@@ -333,10 +361,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_3d
@@ -383,10 +415,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY23]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY24]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -425,10 +461,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_1d
@@ -462,10 +502,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_1d
@@ -499,10 +543,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -546,10 +594,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_2d
@@ -588,10 +640,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_2d
@@ -631,10 +687,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -673,10 +733,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_cl_1d
@@ -710,10 +774,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_cl_1d
@@ -747,10 +815,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -794,10 +866,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_d_cl_2d
@@ -836,10 +912,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_d_cl_2d
@@ -879,10 +959,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -923,10 +1007,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_cl_1d
@@ -962,10 +1050,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_cl_1d
@@ -1002,10 +1094,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1052,10 +1148,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_d_cl_2d
@@ -1097,10 +1197,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_d_cl_2d
@@ -1142,10 +1246,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1182,10 +1290,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cd_1d
@@ -1217,10 +1329,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cd_1d
@@ -1252,10 +1368,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX12-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY15]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY18]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1297,10 +1417,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cd_2d
@@ -1337,10 +1461,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cd_2d
@@ -1377,10 +1505,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX12-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY21]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1419,10 +1551,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cd_1d
@@ -1456,10 +1592,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cd_1d
@@ -1493,10 +1633,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1540,10 +1684,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cd_2d
@@ -1582,10 +1730,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cd_2d
@@ -1625,10 +1777,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1667,10 +1823,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cd_cl_1d
@@ -1704,10 +1864,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX11-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cd_cl_1d
@@ -1741,10 +1905,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX12-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY16]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY19]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1788,10 +1956,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_cd_cl_2d
@@ -1830,10 +2002,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX11-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_cd_cl_2d
@@ -1873,10 +2049,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1917,10 +2097,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cd_cl_1d
@@ -1956,10 +2140,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cd_cl_1d
@@ -1996,10 +2184,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY17]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY18]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY19]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY20]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2046,10 +2238,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX10-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX10-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX10-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX11-LABEL: name: sample_c_cd_cl_2d
@@ -2091,10 +2287,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX11-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX11-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX11-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
   ; GFX12-LABEL: name: sample_c_cd_cl_2d
@@ -2136,10 +2336,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GFX12-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; GFX12-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY20]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr2 = COPY [[COPY22]](s32)
+  ; GFX12-NEXT:   $vgpr3 = COPY [[COPY23]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2322,8 +2526,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
   ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX10-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   ;
   ; GFX11-LABEL: name: sample_c_d_o_2darray_V2
@@ -2367,8 +2573,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
   ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX11-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX11-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
   ;
   ; GFX12-LABEL: name: sample_c_d_o_2darray_V2
@@ -2412,8 +2620,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
   ; GFX12-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8)
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
-  ; GFX12-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; GFX12-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[COPY21]](s32)
+  ; GFX12-NEXT:   $vgpr1 = COPY [[COPY22]](s32)
   ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
index 12234088adca65..67ff69a70c1ce4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
@@ -242,12 +242,23 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
-  ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; UNPACKED-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-  ; UNPACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>)
+  ; UNPACKED-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+  ; UNPACKED-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; UNPACKED-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; UNPACKED-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; UNPACKED-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+  ; UNPACKED-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; UNPACKED-NEXT:   [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; UNPACKED-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; UNPACKED-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+  ; UNPACKED-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; UNPACKED-NEXT:   [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+  ; UNPACKED-NEXT:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
   ; UNPACKED-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; UNPACKED-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[LSHR]](s32), [[BITCAST1]](s32)
+  ; UNPACKED-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LSHR]](s32), [[LSHR1]](s32), [[LSHR2]](s32)
   ; UNPACKED-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
   ; UNPACKED-NEXT:   S_ENDPGM 0
   ;
@@ -268,26 +279,37 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX81-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX81-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX81-NEXT:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; GFX81-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
-  ; GFX81-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX81-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-  ; GFX81-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>)
+  ; GFX81-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+  ; GFX81-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX81-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX81-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+  ; GFX81-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX81-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+  ; GFX81-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX81-NEXT:   [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+  ; GFX81-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX81-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX81-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+  ; GFX81-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX81-NEXT:   [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+  ; GFX81-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+  ; GFX81-NEXT:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
   ; GFX81-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; GFX81-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-  ; GFX81-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
-  ; GFX81-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32)
+  ; GFX81-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+  ; GFX81-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
+  ; GFX81-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C1]](s32)
   ; GFX81-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
-  ; GFX81-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-  ; GFX81-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
-  ; GFX81-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; GFX81-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
+  ; GFX81-NEXT:   [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+  ; GFX81-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C2]]
+  ; GFX81-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+  ; GFX81-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32)
   ; GFX81-NEXT:   [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL1]]
-  ; GFX81-NEXT:   [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
-  ; GFX81-NEXT:   [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
-  ; GFX81-NEXT:   [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
-  ; GFX81-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
-  ; GFX81-NEXT:   [[BITCAST5:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
-  ; GFX81-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+  ; GFX81-NEXT:   [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+  ; GFX81-NEXT:   [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL1]]
+  ; GFX81-NEXT:   [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+  ; GFX81-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+  ; GFX81-NEXT:   [[BITCAST6:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<6 x s16>)
+  ; GFX81-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST6]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
   ; GFX81-NEXT:   S_ENDPGM 0
   ;
   ; GFX9-LABEL: name: image_store_v3f16
@@ -308,8 +330,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
   ; GFX9-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX9-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX9-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+  ; GFX9-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX9-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+  ; GFX9-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+  ; GFX9-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+  ; GFX9-NEXT:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
+  ; GFX9-NEXT:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
   ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; GFX9-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+  ; GFX9-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+  ; GFX9-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+  ; GFX9-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
+  ; GFX9-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
   ; GFX9-NEXT:   S_ENDPGM 0
   ;
   ; GFX10-LABEL: name: image_store_v3f16
@@ -330,8 +373,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX10-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+  ; GFX10-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+  ; GFX10-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX10-NEXT:   [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+  ; GFX10-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+  ; GFX10-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX10-NEXT:   [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+  ; GFX10-NEXT:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
+  ; GFX10-NEXT:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
   ; GFX10-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+  ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+  ; GFX10-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
+  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
   ; GFX10-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: image_store_v3f16
@@ -352,7 +416,28 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX12-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX12-NEXT:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
   ; GFX12-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
-  ; GFX12-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+  ; GFX12-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX12-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX12-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+  ; GFX12-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+  ; GFX12-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+  ; GFX12-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+  ; GFX12-NEXT:   [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX12-NEXT:   [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+  ; GFX12-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+  ; GFX12-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX12-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+  ; GFX12-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+  ; GFX12-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+  ; GFX12-NEXT:   [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+  ; GFX12-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+  ; GFX12-NEXT:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
+  ; GFX12-NEXT:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+  ; GFX12-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+  ; GFX12-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+  ; GFX12-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+  ; GFX12-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
+  ; GFX12-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
   ; GFX12-NEXT:   S_ENDPGM 0
   call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 41e915a4c1011b..7c2c61deca375f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -375,10 +375,15 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
 define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
 ; GFX6-LABEL: abs_sgpr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX6-NEXT:    s_sext_i32_i16 s1, s1
-; GFX6-NEXT:    s_abs_i32 s0, s0
+; GFX6-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX6-NEXT:    s_abs_i32 s1, s1
+; GFX6-NEXT:    s_abs_i32 s0, s0
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: abs_sgpr_v2i16:
@@ -415,6 +420,11 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
 ; GFX6-NEXT:    v_max_i32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GFX6-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX6-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll
index b6b4301dadc7a5..9c2ac009e44a31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll
@@ -14,16 +14,18 @@ define <4 x i8> @global_load_v4i8_align4__rangemd(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p1) :: (load (s32) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
   ; CHECK-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32)
-  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
   ; CHECK-NEXT:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[LOAD]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[LSHR]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[LSHR1]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[LSHR2]](s32)
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+  ; CHECK-NEXT:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C3]](s32)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[LSHR]](s32)
+  ; CHECK-NEXT:   $vgpr1 = COPY [[LSHR1]](s32)
+  ; CHECK-NEXT:   $vgpr2 = COPY [[LSHR2]](s32)
+  ; CHECK-NEXT:   $vgpr3 = COPY [[LSHR3]](s32)
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %load = load <4 x i8>, ptr addrspace(1) %ptr, align 4, !range !0, !noundef !1
   ret <4 x i8> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..d2793000a31e2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -712,6 +712,9 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16:
@@ -741,8 +744,11 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_lshr_v2i16_15:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 15, 1
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index f426fb8954ed26..4d400d53916f16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -2115,7 +2115,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2172,7 +2172,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2237,7 +2237,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %43, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2294,7 +2294,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2357,7 +2357,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %43, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2414,7 +2414,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2476,7 +2476,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2534,7 +2534,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2596,7 +2596,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2654,7 +2654,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2716,7 +2716,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2774,7 +2774,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2835,7 +2835,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2891,7 +2891,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 168e6dfa5f147d..e361ebdf9b608e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -2750,25 +2750,32 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
+; GFX6-NEXT:    v_min_i32_e32 v6, 0, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v7, 1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v7, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
-; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
+; GFX6-NEXT:    v_max_i32_e32 v2, v6, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v5, -2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..d641913ada13da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -636,8 +636,13 @@ define <2 x i16> @v_sext_inreg_v2i16_8(<2 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v2i16_8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_v2i16_8:
@@ -673,8 +678,13 @@ define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v2i16_15:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 1
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..08fc956f2dc45a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -720,6 +720,11 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i16:
@@ -750,7 +755,10 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 15, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 15, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 2572f8581f0edf..4d5a8cb6d69020 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -2753,22 +2753,29 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
-; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_min_i32_e32 v6, -1, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v7
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
-; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
+; GFX6-NEXT:    v_min_i32_e32 v2, v2, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v5, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 788692c94b0cfa..a52e70a4cfc488 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -1872,8 +1872,9 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_not_b32_e32 v3, v1
 ; GFX6-NEXT:    v_min_u32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 0042d34e235d17..1e3c6d1559dab1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1784,8 +1784,9 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_min_u32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 9f093cc7b5abf2..d21e3e7165ef0c 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -9251,11 +9251,21 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v0, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v3, v0, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v0, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v1, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v3, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: multi_use_mul_mad_v2i16_var:
@@ -9373,12 +9383,17 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX67-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    ds_write_b32 v6, v2
 ; GFX67-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 4bed23487445a6..9661154a643815 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6338,14 +6338,17 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_exp_v2f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_v2f16:
@@ -6444,15 +6447,18 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fabs_v2f16:
@@ -6556,15 +6562,18 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fneg_fabs_v2f16:
@@ -6669,15 +6678,18 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fneg_v2f16:
@@ -6758,19 +6770,22 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_exp_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, 0x3dc5
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_v2f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index ec7e52532cd327..045492aeed07bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6431,14 +6431,17 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_exp10_v2f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp10_v2f16:
@@ -6537,15 +6540,18 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp10_fabs_v2f16:
@@ -6649,15 +6655,18 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp10_fneg_fabs_v2f16:
@@ -6762,15 +6771,18 @@ define <2 x half> @v_exp10_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp10_fneg_v2f16:
@@ -6852,19 +6864,22 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_exp10_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, 0x3dc5
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp10_v2f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 32b599e63c61d2..1e520c1750f5f4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -2307,12 +2307,15 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_exp2_v2f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_v2f16:
@@ -2384,12 +2387,15 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fabs_v2f16:
@@ -2468,12 +2474,15 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -2553,12 +2562,15 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fneg_v2f16:
@@ -2628,12 +2640,15 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_exp2_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_v2f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index b9fef0834cb245..fa85f0db33e2ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -236,9 +236,12 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v5, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX6-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32:
@@ -323,8 +326,11 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v2
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 72e86f1f6f9992..17b24ad2ee08ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -504,12 +504,15 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
-; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32:
@@ -638,8 +641,11 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
-; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i16:
@@ -1087,18 +1093,24 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
 ; GFX6-GISEL-LABEL: test_ldexp_v4f16_v4i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v5
-; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v2, v2, v6
+; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v3, v3, v7
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v2, v2, v6
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i32:
@@ -1292,11 +1304,17 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
 ; GFX6-GISEL-NEXT:    v_bfe_i32 v4, v6, 0, 16
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v2, v2, v4
 ; GFX6-GISEL-NEXT:    v_bfe_i32 v4, v7, 0, 16
+; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v3, v3, v4
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 7f4cf19e9b85b4..897f0e9f024b2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6608,14 +6608,17 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_log_v2f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_v2f16:
@@ -6701,15 +6704,18 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fabs_v2f16:
@@ -6827,15 +6833,18 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fneg_fabs_v2f16:
@@ -6954,15 +6963,18 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fneg_v2f16:
@@ -7072,14 +7084,17 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_log_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_v2f16_fast:
@@ -7363,22 +7378,28 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; SI-GISEL-LABEL: v_log_v4f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317218, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317218, v3
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317218, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_v4f16:
@@ -7531,22 +7552,28 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; SI-GISEL-LABEL: v_log_v4f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317218, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317218, v3
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317218, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 1c64e6b76c9577..74c56f5f22875e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6608,14 +6608,17 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_log10_v2f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_v2f16:
@@ -6701,15 +6704,18 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fabs_v2f16:
@@ -6827,15 +6833,18 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
@@ -6954,15 +6963,18 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fneg_v2f16:
@@ -7072,14 +7084,17 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_log10_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_v2f16_fast:
@@ -7363,22 +7378,28 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; SI-GISEL-LABEL: v_log10_v4f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209b, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209b, v3
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209b, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_v4f16:
@@ -7531,22 +7552,28 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; SI-GISEL-LABEL: v_log10_v4f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209b, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209b, v3
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209b, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 50c52037dc4d31..87f46f6000961b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -3073,12 +3073,15 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_log2_v2f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v2f16:
@@ -3161,12 +3164,15 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fabs_v2f16:
@@ -3268,12 +3274,15 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
@@ -3376,12 +3385,15 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fneg_v2f16:
@@ -3474,12 +3486,15 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-LABEL: v_log2_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v2f16_fast:
@@ -3759,18 +3774,24 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; SI-GISEL-LABEL: v_log2_v4f16:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v4f16:
@@ -3889,18 +3910,24 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; SI-GISEL-LABEL: v_log2_v4f16_fast:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 95d579be04ed27..267236e53b40bc 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -472,16 +472,19 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 ; GISEL-CI-LABEL: v_mad_mix_v2f32:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT:    v_mac_f32_e32 v4, v0, v2
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v5, v1, v3
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v4
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v5
+; GISEL-CI-NEXT:    v_mac_f32_e32 v4, v0, v2
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v5
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v4
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-CI-NEXT:    v_or_b32_e32 v0, v1, v0
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -794,26 +797,32 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; GISEL-CI-LABEL: v_mad_mix_v4f32:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GISEL-CI-NEXT:    v_mac_f32_e32 v8, v0, v4
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v9, v1, v5
-; GISEL-CI-NEXT:    v_mac_f32_e32 v10, v2, v6
+; GISEL-CI-NEXT:    v_mac_f32_e32 v8, v0, v4
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v11, v3, v7
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v8
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v9
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v10
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v11
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v9
+; GISEL-CI-NEXT:    v_mac_f32_e32 v10, v2, v6
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v8
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v11
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v10
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-CI-NEXT:    v_or_b32_e32 v0, v1, v0
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GISEL-CI-NEXT:    v_or_b32_e32 v2, v3, v1
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
   %src1.ext = fpext <4 x half> %src1 to <4 x float>
@@ -909,30 +918,33 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GISEL-CI-NEXT:    v_mac_f32_e32 v5, v1, v3
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v4, v0, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v4
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, 0
-; GISEL-CI-NEXT:    v_mac_f32_e32 v5, v1, v3
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v5
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT:    v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT:    v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT:    v_min_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT:    v_min_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1322,52 +1334,58 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GISEL-CI-NEXT:    v_mac_f32_e32 v8, v0, v4
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v9, v1, v5
+; GISEL-CI-NEXT:    v_mac_f32_e32 v8, v0, v4
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v10, v2, v6
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v11, v3, v7
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v8
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v9
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v8
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v10
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v4, v11
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, 0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GISEL-CI-NEXT:    v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT:    v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-CI-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GISEL-CI-NEXT:    v_max_f32_e32 v2, v4, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, 1.0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GISEL-CI-NEXT:    v_min_f32_e32 v0, v0, v5
 ; GISEL-CI-NEXT:    v_min_f32_e32 v1, v1, v5
-; GISEL-CI-NEXT:    v_min_f32_e32 v2, v3, v5
-; GISEL-CI-NEXT:    v_min_f32_e32 v3, v4, v5
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    v_min_f32_e32 v0, v0, v5
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT:    v_min_f32_e32 v2, v2, v5
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    v_min_f32_e32 v3, v3, v5
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GISEL-CI-NEXT:    v_or_b32_e32 v2, v3, v1
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
   %src1.ext = fpext <4 x half> %src1 to <4 x float>
@@ -1514,17 +1532,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v4
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, 0
 ; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GISEL-CI-NEXT:    v_or_b32_e32 v0, v1, v0
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GISEL-CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GISEL-CI-NEXT:    v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT:    v_or_b32_e32 v1, v1, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT:    v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT:    v_min_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT:    v_or_b32_e32 v0, v0, v1
-; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT:    v_min_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1676,16 +1692,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; GISEL-CI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-CI-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-CI-NEXT:    v_or_b32_e32 v0, v0, v1
-; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1824,16 +1836,19 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v2, v4 clamp
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GISEL-CI-NEXT:    v_mad_f32 v1, v1, v3, v5 clamp
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v2, v4 clamp
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -2222,26 +2237,32 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v4, v8 clamp
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; GISEL-CI-NEXT:    v_mad_f32 v1, v1, v5, v9 clamp
-; GISEL-CI-NEXT:    v_mad_f32 v2, v2, v6, v10 clamp
+; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v4, v8 clamp
 ; GISEL-CI-NEXT:    v_mad_f32 v3, v3, v7, v11 clamp
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT:    v_mad_f32 v2, v2, v6, v10 clamp
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GISEL-CI-NEXT:    v_or_b32_e32 v2, v2, v1
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
   %src1.ext = fpext <4 x half> %src1 to <4 x float>
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 0f95c0255d3abc..3015707418d0aa 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -404,23 +404,29 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; GFX6-LABEL: v_roundeven_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v2f16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v2f16:
@@ -522,13 +528,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX6-NEXT:    v_rndne_f32_e32 v0, v1
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_rndne_f32_e32 v1, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v2f16_fneg:
@@ -538,13 +547,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT:    v_rndne_f32_e32 v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_rndne_f32_e32 v1, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v2f16_fneg:
@@ -655,35 +667,47 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX6-LABEL: v_roundeven_v4f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
-; GFX6-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rndne_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_rndne_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v4f16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
-; GFX7-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_rndne_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_rndne_f32_e32 v2, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index d9e0e0298e072f..6dedc6920a30e7 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
   ; CHECK-NEXT:   undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
-  ; CHECK-NEXT:   early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+  ; CHECK-NEXT:   early-clobber %17:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY %17.sub0
   ; CHECK-NEXT:   $sgpr0 = COPY killed [[COPY3]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %17.sub1
   ; CHECK-NEXT:   $sgpr1 = COPY killed [[COPY4]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
   ; CHECK-NEXT:   $sgpr2 = COPY killed [[COPY5]]

>From 55aa772009b50313f7acfbfe586d32c117c1af37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Sat, 19 Oct 2024 18:52:53 +0200
Subject: [PATCH 2/2] Cap the combine at 4 unmerge defs and drop vector_ops_combines from the AArch64 post-legalizer combiner

---
 .../GlobalISel/CombinerHelperArtifacts.cpp    |   2 +-
 llvm/lib/Target/AArch64/AArch64Combine.td     |   2 +-
 .../AArch64/GlobalISel/combine-unmerge.mir    |  17 +-
 llvm/test/CodeGen/AArch64/abs.ll              |  91 +----
 llvm/test/CodeGen/AArch64/bitcast.ll          |   8 +-
 llvm/test/CodeGen/AArch64/bswap.ll            |  27 +-
 llvm/test/CodeGen/AArch64/fabs.ll             |  34 +-
 llvm/test/CodeGen/AArch64/faddsub.ll          | 132 ++-----
 llvm/test/CodeGen/AArch64/fcmp.ll             | 298 +++++---------
 llvm/test/CodeGen/AArch64/fcopysign.ll        |  55 +--
 llvm/test/CodeGen/AArch64/fcvt.ll             | 371 +++++-------------
 llvm/test/CodeGen/AArch64/fdiv.ll             |  66 +---
 llvm/test/CodeGen/AArch64/fexplog.ll          |  40 +-
 llvm/test/CodeGen/AArch64/fminimummaximum.ll  | 132 ++-----
 llvm/test/CodeGen/AArch64/fminmax.ll          | 132 ++-----
 llvm/test/CodeGen/AArch64/fmla.ll             | 248 +++---------
 llvm/test/CodeGen/AArch64/fmul.ll             |  66 +---
 llvm/test/CodeGen/AArch64/fneg.ll             |  32 +-
 llvm/test/CodeGen/AArch64/fpow.ll             |   8 +-
 llvm/test/CodeGen/AArch64/fpowi.ll            |   8 +-
 .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 161 ++++----
 .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 138 +++----
 llvm/test/CodeGen/AArch64/frem.ll             |   8 +-
 llvm/test/CodeGen/AArch64/fsincos.ll          |  16 +-
 llvm/test/CodeGen/AArch64/fsqrt.ll            |  49 +--
 llvm/test/CodeGen/AArch64/load.ll             |  69 ++--
 llvm/test/CodeGen/AArch64/shift.ll            | 348 ++--------------
 llvm/test/CodeGen/AArch64/shufflevector.ll    | 158 +-------
 .../regbankselect-amdgcn.s.buffer.load.ll     |  28 +-
 29 files changed, 661 insertions(+), 2083 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 805d34ae0493c4..cab250ee7e62fa 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -113,7 +113,7 @@ bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
 bool CombinerHelper::matchUnmergeValuesOfScalarAndVector(const MachineInstr &MI,
                                                          BuildFnTy &MatchInfo) {
 
-  constexpr unsigned MAX_NUM_DEFS_LIMIT = 8;
+  constexpr unsigned MAX_NUM_DEFS_LIMIT = 4;
 
   //  %opaque:_(<2 x s64>) = G_OPAQUE
   //  %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1eb7488e4ff570..8af8cdfeba6ac4 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -322,7 +322,7 @@ def AArch64PostLegalizerCombiner
                         extractvecelt_pairwise_add, redundant_or,
                         mul_const, redundant_sext_inreg,
                         form_bitfield_extract, rotate_out_of_range,
-                        icmp_to_true_false_known_bits, vector_ops_combines,
+                        icmp_to_true_false_known_bits,
                         select_combines, fold_merge_to_zext,
                         constant_fold_binops, identity_combines,
                         ptr_add_immed_chain, overlapping_and,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index fc7584a2e1b162..e401cebd93a924 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -585,22 +585,7 @@ body:             |
   bb.1:
     ; CHECK-LABEL: name: test_long_opaque_vector_scalar
     ; CHECK: %opaque:_(<8 x s16>) = COPY $q0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-    ; CHECK-NEXT: %un1:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C]](s64)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; CHECK-NEXT: %un2:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C1]](s64)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; CHECK-NEXT: %un3:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C2]](s64)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
-    ; CHECK-NEXT: %un4:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C3]](s64)
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; CHECK-NEXT: %un5:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C4]](s64)
-    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
-    ; CHECK-NEXT: %un6:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C5]](s64)
-    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; CHECK-NEXT: %un7:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C6]](s64)
-    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
-    ; CHECK-NEXT: %un8:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C7]](s64)
+    ; CHECK-NEXT: %un1:_(s16), %un2:_(s16), %un3:_(s16), %un4:_(s16), %un5:_(s16), %un6:_(s16), %un7:_(s16), %un8:_(s16) = G_UNMERGE_VALUES %opaque(<8 x s16>)
     ; CHECK-NEXT: %zext1:_(s32) = G_ZEXT %un1(s16)
     ; CHECK-NEXT: %zext2:_(s32) = G_ZEXT %un2(s16)
     ; CHECK-NEXT: %zext3:_(s32) = G_ZEXT %un3(s16)
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index b5794007bdddb0..29fe2d02a93e11 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -355,66 +355,10 @@ entry:
 declare <3 x i8> @llvm.abs.v3i8(<3 x i8>, i1)
 
 define <7 x i8> @abs_v7i8(<7 x i8> %a){
-; CHECK-SD-LABEL: abs_v7i8:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    abs v0.8b, v0.8b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: abs_v7i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
-; CHECK-GI-NEXT:    mov b3, v0.b[2]
-; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
-; CHECK-GI-NEXT:    mov b1, v0.b[3]
-; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov b3, v0.b[4]
-; CHECK-GI-NEXT:    mov v2.b[3], v1.b[0]
-; CHECK-GI-NEXT:    mov b1, v0.b[5]
-; CHECK-GI-NEXT:    mov b0, v0.b[6]
-; CHECK-GI-NEXT:    mov v2.b[4], v3.b[0]
-; CHECK-GI-NEXT:    mov v2.b[5], v1.b[0]
-; CHECK-GI-NEXT:    mov v2.b[6], v0.b[0]
-; CHECK-GI-NEXT:    abs v0.8b, v2.8b
-; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov b4, v0.b[4]
-; CHECK-GI-NEXT:    mov b5, v0.b[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov b1, v0.b[5]
-; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.h[2], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.h[4], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.h[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.h[6], w8
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[3]
-; CHECK-GI-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NEXT:    mov h5, v0.h[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov h1, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.b[1], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.b[2], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.b[3], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.b[4], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.b[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.b[6], w8
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: abs_v7i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    abs v0.8b, v0.8b
+; CHECK-NEXT:    ret
 entry:
   %res = call <7 x i8> @llvm.abs.v7i8(<7 x i8> %a, i1 0)
   ret <7 x i8> %res
@@ -453,29 +397,10 @@ entry:
 declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
 
 define <7 x i16> @abs_v7i16(<7 x i16> %a){
-; CHECK-SD-LABEL: abs_v7i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    abs v0.8h, v0.8h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: abs_v7i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-NEXT:    abs v1.8h, v1.8h
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: abs_v7i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    abs v0.8h, v0.8h
+; CHECK-NEXT:    ret
 entry:
   %res = call <7 x i16> @llvm.abs.v7i16(<7 x i16> %a, i1 0)
   ret <7 x i16> %res
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 8449b69a473d92..e34bac2e2fa69a 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -647,13 +647,7 @@ define <6 x i16> @bitcast_v3i32_v6i16(<3 x i32> %a, <3 x i32> %b){
 ; CHECK-GI-NEXT:    mov v3.s[1], v1.s[1]
 ; CHECK-GI-NEXT:    mov v2.s[2], v0.s[2]
 ; CHECK-GI-NEXT:    mov v3.s[2], v1.s[2]
-; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT:    add v0.4s, v2.4s, v3.4s
 ; CHECK-GI-NEXT:    ret
   %c = add <3 x i32> %a, %b
   %d = bitcast <3 x i32> %c to <6 x i16>
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9f9653fcbb50b5..fd1ac47bef7d15 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -277,29 +277,10 @@ entry:
 declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>)
 
 define <7 x i16> @bswap_v7i16(<7 x i16> %a){
-; CHECK-SD-LABEL: bswap_v7i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: bswap_v7i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-NEXT:    rev16 v1.16b, v1.16b
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: bswap_v7i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    ret
 entry:
   %res = call <7 x i16> @llvm.bswap.v7i16(<7 x i16> %a)
   ret <7 x i16> %res
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index 1aed6cb8bf9ed8..0e1f9fba307add 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -174,41 +174,13 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v1.16b, v0.16b
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mvni v1.8h, #128, lsl #8
+; CHECK-GI-NOFP16-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    fabs v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fabs v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.fabs.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index 4227c891d844f4..de9a458a98b60f 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -201,68 +201,32 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fadd_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fadd v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fadd_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    fadd v1.8h, v2.8h, v3.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fadd <7 x half> %a, %b
@@ -593,68 +557,32 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fsub_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fsub v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fsub v0.4s, v0.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fsub_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    fsub v1.8h, v2.8h, v3.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fsub v0.8h, v0.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fsub <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 584ffa92493d08..c1459ac5b56434 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1245,134 +1245,70 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ;
 ; CHECK-GI-NOFP16-LABEL: v7f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #15 // =0xf
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w8
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
 ; CHECK-GI-NOFP16-NEXT:    mov w9, #65535 // =0xffff
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[0], v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fmov s7, w9
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v19.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], w8
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], w9
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v19.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], w9
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v19.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[3], w8
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v17.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], w8
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[3], w9
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v19.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v16.4h
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[4], w8
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[4], w8
+; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v6.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[4], w9
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[4], v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v19.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v4.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v5.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[5], w8
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[5], w8
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[5], w9
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[5], v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v19.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[6], w8
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[6], w8
 ; CHECK-GI-NOFP16-NEXT:    mov v7.h[6], w9
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[6], v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v19.h[6], v3.h[6]
-; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v6.8h
-; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v6.8h
+; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
+; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v5.8h
 ; CHECK-GI-NOFP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v0.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT:    and v0.16b, v18.16b, v0.16b
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v19.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v0.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_half:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v4.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[0], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    mov w8, #15 // =0xf
-; CHECK-GI-FP16-NEXT:    fmov s6, w8
 ; CHECK-GI-FP16-NEXT:    mov w9, #65535 // =0xffff
-; CHECK-GI-FP16-NEXT:    mov v16.h[0], v2.h[0]
-; CHECK-GI-FP16-NEXT:    fmov s7, w9
-; CHECK-GI-FP16-NEXT:    mov v17.h[0], v3.h[0]
-; CHECK-GI-FP16-NEXT:    mov v4.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v5.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v6.h[1], w8
-; CHECK-GI-FP16-NEXT:    mov v7.h[1], w9
-; CHECK-GI-FP16-NEXT:    mov v16.h[1], v2.h[1]
-; CHECK-GI-FP16-NEXT:    mov v17.h[1], v3.h[1]
-; CHECK-GI-FP16-NEXT:    mov v4.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v5.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v6.h[2], w8
-; CHECK-GI-FP16-NEXT:    mov v7.h[2], w9
-; CHECK-GI-FP16-NEXT:    mov v16.h[2], v2.h[2]
-; CHECK-GI-FP16-NEXT:    mov v17.h[2], v3.h[2]
-; CHECK-GI-FP16-NEXT:    mov v4.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v5.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v6.h[3], w8
-; CHECK-GI-FP16-NEXT:    mov v7.h[3], w9
-; CHECK-GI-FP16-NEXT:    mov v16.h[3], v2.h[3]
-; CHECK-GI-FP16-NEXT:    mov v17.h[3], v3.h[3]
-; CHECK-GI-FP16-NEXT:    mov v4.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v5.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v6.h[4], w8
-; CHECK-GI-FP16-NEXT:    mov v7.h[4], w9
-; CHECK-GI-FP16-NEXT:    mov v16.h[4], v2.h[4]
-; CHECK-GI-FP16-NEXT:    mov v17.h[4], v3.h[4]
-; CHECK-GI-FP16-NEXT:    mov v4.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v5.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v6.h[5], w8
-; CHECK-GI-FP16-NEXT:    mov v7.h[5], w9
-; CHECK-GI-FP16-NEXT:    mov v16.h[5], v2.h[5]
-; CHECK-GI-FP16-NEXT:    mov v17.h[5], v3.h[5]
-; CHECK-GI-FP16-NEXT:    mov v4.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v5.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    mov v6.h[6], w8
-; CHECK-GI-FP16-NEXT:    mov v7.h[6], w9
-; CHECK-GI-FP16-NEXT:    mov v16.h[6], v2.h[6]
-; CHECK-GI-FP16-NEXT:    mov v17.h[6], v3.h[6]
-; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v5.8h, v4.8h
-; CHECK-GI-FP16-NEXT:    neg v1.8h, v6.8h
-; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v6.8h
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    fmov s4, w8
+; CHECK-GI-FP16-NEXT:    fmov s5, w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[2], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[3], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[4], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[5], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[6], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], w9
+; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v4.8h
+; CHECK-GI-FP16-NEXT:    neg v1.8h, v4.8h
 ; CHECK-GI-FP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v7.16b
-; CHECK-GI-FP16-NEXT:    and v0.16b, v16.16b, v0.16b
-; CHECK-GI-FP16-NEXT:    and v1.16b, v17.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    orr v1.16b, v0.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-FP16-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-FP16-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt <7 x half> %a, %b
@@ -1795,69 +1731,61 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-NOFP16-LABEL: v7f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[0], w8
 ; CHECK-GI-NOFP16-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[0], w8
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[0], w9
-; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp]
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[0], w0
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[0], w7
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[1], w9
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], w1
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[1], v18.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[0], w0
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[0], w9
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[0], w7
+; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp]
+; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #24]
 ; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp, #32]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[2], w9
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], w2
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], w4
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[3], w3
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v5.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp, #24]
-; CHECK-GI-NOFP16-NEXT:    ldr s4, [sp, #8]
-; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], w5
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w1
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[1], v18.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #8]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w2
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[2], w9
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #40]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], v18.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[2], v4.s[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s4, [sp, #40]
-; CHECK-GI-NOFP16-NEXT:    ushl v0.4s, v0.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    neg v6.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], w6
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[3], w3
 ; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], v4.s[0]
-; CHECK-GI-NOFP16-NEXT:    sshl v0.4s, v0.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[3], v6.s[0]
-; CHECK-GI-NOFP16-NEXT:    eor v3.16b, v0.16b, v16.16b
-; CHECK-GI-NOFP16-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v5.16b, v3.16b
-; CHECK-GI-NOFP16-NEXT:    bsl v2.16b, v7.16b, v17.16b
-; CHECK-GI-NOFP16-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    mov s1, v2.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v2.s[2]
-; CHECK-GI-NOFP16-NEXT:    mov s4, v2.s[3]
-; CHECK-GI-NOFP16-NEXT:    fmov w0, s2
-; CHECK-GI-NOFP16-NEXT:    mov s5, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s6, v0.s[2]
-; CHECK-GI-NOFP16-NEXT:    fmov w4, s0
-; CHECK-GI-NOFP16-NEXT:    fmov w1, s1
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[0], w4
+; CHECK-GI-NOFP16-NEXT:    ushl v2.4s, v2.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    neg v4.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w5
+; CHECK-GI-NOFP16-NEXT:    sshl v2.4s, v2.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    ldr s4, [sp, #16]
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w6
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[3], v4.s[0]
+; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v2.16b, v6.16b
+; CHECK-GI-NOFP16-NEXT:    and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v17.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v5.16b, v7.16b
+; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s3, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NOFP16-NEXT:    fmov w0, s0
+; CHECK-GI-NOFP16-NEXT:    mov s5, v1.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s6, v1.s[2]
+; CHECK-GI-NOFP16-NEXT:    fmov w4, s1
+; CHECK-GI-NOFP16-NEXT:    fmov w1, s2
 ; CHECK-GI-NOFP16-NEXT:    fmov w2, s3
 ; CHECK-GI-NOFP16-NEXT:    fmov w3, s4
 ; CHECK-GI-NOFP16-NEXT:    fmov w5, s5
@@ -1866,51 +1794,37 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    mov w9, #31 // =0x1f
 ; CHECK-GI-FP16-NEXT:    mov v4.s[0], w0
+; CHECK-GI-FP16-NEXT:    mov v2.s[0], w9
 ; CHECK-GI-FP16-NEXT:    mov v5.s[0], w7
 ; CHECK-GI-FP16-NEXT:    ldr s6, [sp]
 ; CHECK-GI-FP16-NEXT:    mov v7.s[0], w4
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
 ; CHECK-GI-FP16-NEXT:    ldr s17, [sp, #8]
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT:    umov w10, v0.h[5]
 ; CHECK-GI-FP16-NEXT:    mov v4.s[1], w1
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-FP16-NEXT:    mov v5.s[1], v6.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s6, [sp, #24]
 ; CHECK-GI-FP16-NEXT:    mov v7.s[1], w5
 ; CHECK-GI-FP16-NEXT:    mov v6.s[1], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w9
 ; CHECK-GI-FP16-NEXT:    mov v4.s[2], w2
 ; CHECK-GI-FP16-NEXT:    mov v5.s[2], v17.s[0]
 ; CHECK-GI-FP16-NEXT:    mov v7.s[2], w6
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v6.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v4.s[3], w3
-; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v3.8h, v2.8h
-; CHECK-GI-FP16-NEXT:    mov v2.s[0], w9
-; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
-; CHECK-GI-FP16-NEXT:    umov w10, v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
-; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
-; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w9
-; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
 ; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v3.s[0], w10
+; CHECK-GI-FP16-NEXT:    mov v4.s[3], w3
 ; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
 ; CHECK-GI-FP16-NEXT:    mov v3.s[1], w10
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 7f07b088182cae..6eb2d958540bef 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -213,46 +213,25 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-LABEL: copysign_v7f16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v3.h[0], v1.h[0]
 ; CHECK-GI-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-GI-NEXT:    mov w9, #32767 // =0x7fff
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v5.h[1], w8
-; CHECK-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v4.h[1], w9
-; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v5.h[2], w8
-; CHECK-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v4.h[2], w9
-; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NEXT:    mov v5.h[3], w8
-; CHECK-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v4.h[3], w9
-; CHECK-GI-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-NEXT:    mov v5.h[4], w8
-; CHECK-GI-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v4.h[4], w9
-; CHECK-GI-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-NEXT:    mov v5.h[5], w8
-; CHECK-GI-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v4.h[5], w9
-; CHECK-GI-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-GI-NEXT:    mov v5.h[6], w8
-; CHECK-GI-NEXT:    mov v4.h[6], w9
-; CHECK-GI-NEXT:    and v1.16b, v3.16b, v5.16b
-; CHECK-GI-NEXT:    and v0.16b, v2.16b, v4.16b
-; CHECK-GI-NEXT:    orr v1.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    mov v3.h[2], w8
+; CHECK-GI-NEXT:    mov v2.h[3], w9
+; CHECK-GI-NEXT:    mov v3.h[3], w8
+; CHECK-GI-NEXT:    mov v2.h[4], w9
+; CHECK-GI-NEXT:    mov v3.h[4], w8
+; CHECK-GI-NEXT:    mov v2.h[5], w9
+; CHECK-GI-NEXT:    mov v3.h[5], w8
+; CHECK-GI-NEXT:    mov v2.h[6], w9
+; CHECK-GI-NEXT:    mov v3.h[6], w8
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.copysign.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 55d9984c6392f5..15a8f0557cc417 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -175,52 +175,27 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: ceil_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    frintp v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintp v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: ceil_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    frintp v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    frintp v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.ceil.v7f16(<7 x half> %a)
@@ -511,52 +486,27 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: floor_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    frintm v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintm v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: floor_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    frintm v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    frintm v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.floor.v7f16(<7 x half> %a)
@@ -847,52 +797,27 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    frinti v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frinti v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: nearbyint_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    frinti v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    frinti v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.nearbyint.v7f16(<7 x half> %a)
@@ -1183,52 +1108,27 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: roundeven_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    frintn v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintn v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: roundeven_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    frintn v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    frintn v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.roundeven.v7f16(<7 x half> %a)
@@ -1519,52 +1419,27 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: rint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    frintx v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintx v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: rint_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    frintx v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    frintx v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.rint.v7f16(<7 x half> %a)
@@ -1855,52 +1730,27 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: round_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    frinta v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frinta v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: round_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    frinta v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    frinta v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.round.v7f16(<7 x half> %a)
@@ -2191,52 +2041,27 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: trunc_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    frintz v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintz v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: trunc_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    frintz v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    frintz v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.trunc.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index 9acd0166fcaa85..82ce3af7e614f1 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -201,68 +201,32 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fdiv_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fdiv v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fdiv_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    fdiv v1.8h, v2.8h, v3.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fdiv v0.8h, v0.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fdiv <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index 6072a2c56a06d1..08068ac4f10881 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -732,13 +732,7 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2047,13 +2041,7 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3362,13 +3350,7 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4677,13 +4659,7 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -5992,13 +5968,7 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index c2e91a9956af91..e8201f62599b75 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -692,68 +692,32 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v0.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fmin v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT:    fmin v0.4s, v0.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmin v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-FP16-GI-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-FP16-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-FP16-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-FP16-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-FP16-GI-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-FP16-GI-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-FP16-GI-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-FP16-GI-NEXT:    fmin v1.8h, v2.8h, v3.8h
-; CHECK-FP16-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT:    fmin v0.8h, v0.8h, v1.8h
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.minimum.v7f16(<7 x half> %a, <7 x half> %b)
@@ -826,68 +790,32 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v0.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fmax v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT:    fmax v0.4s, v0.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmax v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-FP16-GI-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-FP16-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-FP16-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-FP16-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-FP16-GI-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-FP16-GI-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-FP16-GI-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-FP16-GI-NEXT:    fmax v1.8h, v2.8h, v3.8h
-; CHECK-FP16-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT:    fmax v0.8h, v0.8h, v1.8h
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.maximum.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index b7af6be8721d68..8a613907807c4f 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -692,68 +692,32 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v0.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fminnm v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT:    fminnm v0.4s, v0.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fminnm v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-FP16-GI-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-FP16-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-FP16-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-FP16-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-FP16-GI-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-FP16-GI-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-FP16-GI-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-FP16-GI-NEXT:    fminnm v1.8h, v2.8h, v3.8h
-; CHECK-FP16-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT:    fminnm v0.8h, v0.8h, v1.8h
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.minnum.v7f16(<7 x half> %a, <7 x half> %b)
@@ -826,68 +790,32 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v0.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT:    fmaxnm v0.4s, v0.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmaxnm v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-FP16-GI-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-FP16-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-FP16-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-FP16-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-FP16-GI-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-FP16-GI-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-FP16-GI-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-FP16-GI-NEXT:    fmaxnm v1.8h, v2.8h, v3.8h
-; CHECK-FP16-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT:    fmaxnm v0.8h, v0.8h, v1.8h
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.maxnum.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 0a9d4c7b657e06..7ed9425ed42e90 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -272,84 +272,38 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fma_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fmla v5.4s, v4.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v2.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v5.4h, v5.4s
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v5.h[1]
-; CHECK-GI-NOFP16-NEXT:    fmla v3.4s, v2.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v5.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v5.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v5.h[1]
+; CHECK-GI-NOFP16-NEXT:    fmla v3.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v5.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fma_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v3.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v4.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[0], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v4.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v5.h[1], v2.h[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v4.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v5.h[2], v2.h[2]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v4.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v5.h[3], v2.h[3]
-; CHECK-GI-FP16-NEXT:    mov v3.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v4.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v5.h[4], v2.h[4]
-; CHECK-GI-FP16-NEXT:    mov v3.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v4.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v5.h[5], v2.h[5]
-; CHECK-GI-FP16-NEXT:    mov v3.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v4.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    mov v5.h[6], v2.h[6]
-; CHECK-GI-FP16-NEXT:    fmla v5.8h, v4.8h, v3.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v5.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v5.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v5.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v5.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v5.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v5.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v5.h[6]
+; CHECK-GI-FP16-NEXT:    fmla v2.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %d = call <7 x half> @llvm.fma.v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c)
@@ -934,90 +888,44 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmuladd_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[6], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmuladd_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v3.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v4.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[0], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v4.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v5.h[1], v2.h[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v4.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v5.h[2], v2.h[2]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v4.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v5.h[3], v2.h[3]
-; CHECK-GI-FP16-NEXT:    mov v3.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v4.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v5.h[4], v2.h[4]
-; CHECK-GI-FP16-NEXT:    mov v3.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v4.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v5.h[5], v2.h[5]
-; CHECK-GI-FP16-NEXT:    mov v3.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v4.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    mov v5.h[6], v2.h[6]
-; CHECK-GI-FP16-NEXT:    fmla v5.8h, v4.8h, v3.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v5.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v5.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v5.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v5.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v5.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v5.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v5.h[6]
+; CHECK-GI-FP16-NEXT:    fmla v2.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %d = call <7 x half> @llvm.fmuladd.v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c)
@@ -1480,90 +1388,44 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[6], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v3.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v4.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[0], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v4.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v5.h[1], v2.h[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v4.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v5.h[2], v2.h[2]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v4.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v5.h[3], v2.h[3]
-; CHECK-GI-FP16-NEXT:    mov v3.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v4.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v5.h[4], v2.h[4]
-; CHECK-GI-FP16-NEXT:    mov v3.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v4.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v5.h[5], v2.h[5]
-; CHECK-GI-FP16-NEXT:    mov v3.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v4.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    mov v5.h[6], v2.h[6]
-; CHECK-GI-FP16-NEXT:    fmla v5.8h, v3.8h, v4.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v5.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v5.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v5.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v5.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v5.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v5.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v5.h[6]
+; CHECK-GI-FP16-NEXT:    fmla v2.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %d = fmul fast <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index de6618ac18f157..f045c5ab96c4e6 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -201,68 +201,32 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v0.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT:    fmul v1.8h, v2.8h, v3.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fmul <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index dd6266e8b3b1f4..bcd4bcf4c2b0b9 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -175,41 +175,13 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    movi v1.8h, #128, lsl #8
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v2.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    eor v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    fneg v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fneg v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fneg <7 x half> %a
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index fb7efe82582322..08589d647d189c 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -885,13 +885,7 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #176
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index 3f122ee06d99a9..af81d5fa5bf6fd 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -815,13 +815,7 @@ define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index a7c51ea2b9ace1..8dae8328f3ceb6 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -397,35 +397,36 @@ define <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) {
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT:    fcvtzs v4.2d, v4.2d
+; CHECK-GI-NEXT:    fcvtzs v3.2d, v4.2d
 ; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI12_1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
-; CHECK-GI-NEXT:    cmgt v3.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    cmgt v4.2d, v2.2d, v0.2d
 ; CHECK-GI-NEXT:    cmgt v5.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v4.16b
 ; CHECK-GI-NEXT:    bif v1.16b, v2.16b, v5.16b
-; CHECK-GI-NEXT:    cmgt v5.2d, v2.2d, v4.2d
-; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI12_0]
-; CHECK-GI-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-GI-NEXT:    cmgt v6.2d, v0.2d, v3.2d
-; CHECK-GI-NEXT:    cmgt v7.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v6.16b
-; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v7.16b
 ; CHECK-GI-NEXT:    cmgt v4.2d, v2.2d, v3.2d
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    mov v1.16b, v4.16b
-; CHECK-GI-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NEXT:    mov s4, v0.s[3]
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    mov v1.s[0], v1.s[0]
-; CHECK-GI-NEXT:    fmov w1, s2
-; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w3, s4
-; CHECK-GI-NEXT:    fmov w4, s1
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT:    bit v2.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT:    cmgt v3.2d, v0.2d, v5.2d
+; CHECK-GI-NEXT:    cmgt v4.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT:    bif v0.16b, v5.16b, v3.16b
+; CHECK-GI-NEXT:    bif v1.16b, v5.16b, v4.16b
+; CHECK-GI-NEXT:    cmgt v3.2d, v2.2d, v5.2d
+; CHECK-GI-NEXT:    bif v2.16b, v5.16b, v3.16b
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x2, d1
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT:    fmov x4, d2
+; CHECK-GI-NEXT:    fmov x1, d3
+; CHECK-GI-NEXT:    fmov x3, d4
+; CHECK-GI-NEXT:    // kill: def $w4 killed $w4 killed $x4
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 killed $x3
 ; CHECK-GI-NEXT:    ret
     %x = call <5 x i32> @llvm.fptosi.sat.v5f64.v5i32(<5 x double> %f)
     ret <5 x i32> %x
@@ -444,49 +445,49 @@ define <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v6f64_v6i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-GI-NEXT:    adrp x8, .LCPI13_1
-; CHECK-GI-NEXT:    mov v4.d[1], v5.d[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT:    mov v4.d[1], v5.d[0]
 ; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI13_1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
-; CHECK-GI-NEXT:    ldr q6, [x8, :lo12:.LCPI13_0]
-; CHECK-GI-NEXT:    fcvtzs v1.2d, v4.2d
 ; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-GI-NEXT:    cmgt v4.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    cmgt v5.2d, v3.2d, v2.2d
-; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT:    fcvtzs v2.2d, v4.2d
 ; CHECK-GI-NEXT:    cmgt v4.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    bif v2.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT:    cmgt v5.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    cmgt v6.2d, v3.2d, v2.2d
 ; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT:    cmgt v3.2d, v1.2d, v6.2d
-; CHECK-GI-NEXT:    cmgt v4.2d, v2.2d, v6.2d
-; CHECK-GI-NEXT:    bif v1.16b, v6.16b, v3.16b
-; CHECK-GI-NEXT:    cmgt v3.2d, v0.2d, v6.2d
-; CHECK-GI-NEXT:    bif v2.16b, v6.16b, v4.16b
-; CHECK-GI-NEXT:    bif v0.16b, v6.16b, v3.16b
-; CHECK-GI-NEXT:    mov d3, v1.d[1]
-; CHECK-GI-NEXT:    mov v1.s[0], v1.s[0]
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT:    fmov x8, d3
-; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NEXT:    mov s4, v0.s[3]
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    mov s5, v1.s[1]
-; CHECK-GI-NEXT:    fmov w1, s2
-; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w3, s4
-; CHECK-GI-NEXT:    fmov w4, s1
-; CHECK-GI-NEXT:    fmov w5, s5
+; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT:    bif v2.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT:    cmgt v4.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT:    cmgt v5.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    cmgt v6.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT:    bif v2.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x2, d1
+; CHECK-GI-NEXT:    fmov x4, d2
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT:    // kill: def $w4 killed $w4 killed $x4
+; CHECK-GI-NEXT:    fmov x1, d3
+; CHECK-GI-NEXT:    fmov x3, d4
+; CHECK-GI-NEXT:    fmov x5, d5
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 killed $x3
+; CHECK-GI-NEXT:    // kill: def $w5 killed $w5 killed $x5
 ; CHECK-GI-NEXT:    ret
     %x = call <6 x i32> @llvm.fptosi.sat.v6f64.v6i32(<6 x double> %f)
     ret <6 x i32> %x
@@ -1285,22 +1286,18 @@ define <5 x i32> @test_signed_v5f16_v5i32(<5 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v5f16_v5i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NEXT:    mov v0.h[0], v0.h[4]
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
-; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-GI-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NEXT:    mov s4, v1.s[3]
 ; CHECK-GI-NEXT:    fmov w0, s1
-; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w3, s4
 ; CHECK-GI-NEXT:    ret
     %x = call <5 x i32> @llvm.fptosi.sat.v5f16.v5i32(<5 x half> %f)
@@ -1324,26 +1321,22 @@ define <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v6f16_v6i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[5]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NEXT:    mov s4, v1.s[1]
-; CHECK-GI-NEXT:    mov s5, v0.s[3]
+; CHECK-GI-NEXT:    mov s4, v0.s[3]
 ; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    fmov w4, s1
+; CHECK-GI-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w5, s4
-; CHECK-GI-NEXT:    fmov w3, s5
+; CHECK-GI-NEXT:    fmov w3, s4
+; CHECK-GI-NEXT:    fmov w4, s1
+; CHECK-GI-NEXT:    fmov w5, s5
 ; CHECK-GI-NEXT:    ret
     %x = call <6 x i32> @llvm.fptosi.sat.v6f16.v6i32(<6 x half> %f)
     ret <6 x i32> %x
@@ -1367,27 +1360,23 @@ define <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v7f16_v7i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[6]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-GI-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NEXT:    mov s6, v1.s[2]
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
 ; CHECK-GI-NEXT:    fmov w3, s4
+; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w5, s5
 ; CHECK-GI-NEXT:    fmov w6, s6
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index eb68125080f33a..a86c41a7b7edd7 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -378,25 +378,27 @@ define <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
 ; CHECK-GI-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-GI-NEXT:    fcvtzu v4.2d, v4.2d
+; CHECK-GI-NEXT:    fcvtzu v3.2d, v4.2d
 ; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtzu v2.2d, v2.2d
-; CHECK-GI-NEXT:    cmhi v3.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    cmhi v4.2d, v1.2d, v0.2d
 ; CHECK-GI-NEXT:    cmhi v5.2d, v1.2d, v2.2d
-; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v4.16b
 ; CHECK-GI-NEXT:    bif v2.16b, v1.16b, v5.16b
-; CHECK-GI-NEXT:    cmhi v3.2d, v1.2d, v4.2d
-; CHECK-GI-NEXT:    bit v1.16b, v4.16b, v3.16b
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT:    mov v1.s[0], v1.s[0]
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NEXT:    mov s4, v0.s[3]
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    fmov w4, s1
-; CHECK-GI-NEXT:    fmov w1, s2
-; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w3, s4
+; CHECK-GI-NEXT:    cmhi v4.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    bit v1.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d4, v2.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x2, d2
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT:    fmov x4, d1
+; CHECK-GI-NEXT:    fmov x1, d3
+; CHECK-GI-NEXT:    fmov x3, d4
+; CHECK-GI-NEXT:    // kill: def $w4 killed $w4 killed $x4
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 killed $x3
 ; CHECK-GI-NEXT:    ret
     %x = call <5 x i32> @llvm.fptoui.sat.v5f64.v5i32(<5 x double> %f)
     ret <5 x i32> %x
@@ -415,40 +417,40 @@ define <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v6f64_v6i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT:    mov v4.d[1], v5.d[0]
+; CHECK-GI-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT:    movi v3.2d, #0x000000ffffffff
-; CHECK-GI-NEXT:    fcvtzu v1.2d, v4.2d
+; CHECK-GI-NEXT:    mov v4.d[1], v5.d[0]
+; CHECK-GI-NEXT:    movi v1.2d, #0x000000ffffffff
 ; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtzu v2.2d, v2.2d
-; CHECK-GI-NEXT:    cmhi v4.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    cmhi v5.2d, v3.2d, v2.2d
-; CHECK-GI-NEXT:    bif v1.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT:    cmhi v4.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    bif v2.16b, v3.16b, v5.16b
-; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT:    mov d3, v1.d[1]
-; CHECK-GI-NEXT:    mov v1.s[0], v1.s[0]
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT:    fmov x8, d3
-; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NEXT:    mov s4, v0.s[3]
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    mov s5, v1.s[1]
-; CHECK-GI-NEXT:    fmov w1, s2
-; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w3, s4
-; CHECK-GI-NEXT:    fmov w4, s1
-; CHECK-GI-NEXT:    fmov w5, s5
+; CHECK-GI-NEXT:    fcvtzu v3.2d, v4.2d
+; CHECK-GI-NEXT:    cmhi v4.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    cmhi v5.2d, v1.2d, v2.2d
+; CHECK-GI-NEXT:    cmhi v6.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v4.16b
+; CHECK-GI-NEXT:    bif v2.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT:    bit v1.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d4, v2.d[1]
+; CHECK-GI-NEXT:    mov d5, v1.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x2, d2
+; CHECK-GI-NEXT:    fmov x4, d1
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT:    // kill: def $w4 killed $w4 killed $x4
+; CHECK-GI-NEXT:    fmov x1, d3
+; CHECK-GI-NEXT:    fmov x3, d4
+; CHECK-GI-NEXT:    fmov x5, d5
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 killed $x3
+; CHECK-GI-NEXT:    // kill: def $w5 killed $w5 killed $x5
 ; CHECK-GI-NEXT:    ret
     %x = call <6 x i32> @llvm.fptoui.sat.v6f64.v6i32(<6 x double> %f)
     ret <6 x i32> %x
@@ -1115,22 +1117,18 @@ define <5 x i32> @test_unsigned_v5f16_v5i32(<5 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v5f16_v5i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NEXT:    mov v0.h[0], v0.h[4]
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
-; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-GI-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NEXT:    mov s4, v1.s[3]
 ; CHECK-GI-NEXT:    fmov w0, s1
-; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w3, s4
 ; CHECK-GI-NEXT:    ret
     %x = call <5 x i32> @llvm.fptoui.sat.v5f16.v5i32(<5 x half> %f)
@@ -1154,26 +1152,22 @@ define <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v6f16_v6i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[5]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v0.s[2]
-; CHECK-GI-NEXT:    mov s4, v1.s[1]
-; CHECK-GI-NEXT:    mov s5, v0.s[3]
+; CHECK-GI-NEXT:    mov s4, v0.s[3]
 ; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    fmov w4, s1
+; CHECK-GI-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
-; CHECK-GI-NEXT:    fmov w5, s4
-; CHECK-GI-NEXT:    fmov w3, s5
+; CHECK-GI-NEXT:    fmov w3, s4
+; CHECK-GI-NEXT:    fmov w4, s1
+; CHECK-GI-NEXT:    fmov w5, s5
 ; CHECK-GI-NEXT:    ret
     %x = call <6 x i32> @llvm.fptoui.sat.v6f16.v6i32(<6 x half> %f)
     ret <6 x i32> %x
@@ -1197,27 +1191,23 @@ define <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v7f16_v7i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v1.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[4]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[5]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[6]
+; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-GI-NEXT:    fcvtzu v1.4s, v1.4s
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NEXT:    mov s6, v1.s[2]
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
 ; CHECK-GI-NEXT:    fmov w3, s4
+; CHECK-GI-NEXT:    fmov w4, s1
 ; CHECK-GI-NEXT:    fmov w5, s5
 ; CHECK-GI-NEXT:    fmov w6, s6
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index ad8576c63b1aea..cc2443497ad83b 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -886,13 +886,7 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #176
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index eac17ec72bc990..4136dfe010eadd 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -731,13 +731,7 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2045,13 +2039,7 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 15e93e244f1d5c..7514c9235b0397 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -207,52 +207,27 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: sqrt_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    fsqrt v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    fsqrt v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: sqrt_v7f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT:    fsqrt v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT:    fsqrt v0.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = call <7 x half> @llvm.sqrt.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 517cf7c4352fd3..a93a089cda3be2 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -238,38 +238,20 @@ define <7 x i8> @load_v7i8(ptr %ptr){
 ;
 ; CHECK-GI-LABEL: load_v7i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w9
-; CHECK-GI-NEXT:    mov v0.h[2], w8
-; CHECK-GI-NEXT:    ldrb w8, [x0, #3]
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    ldrb w8, [x0, #4]
-; CHECK-GI-NEXT:    mov v0.h[4], w8
-; CHECK-GI-NEXT:    ldrb w8, [x0, #5]
-; CHECK-GI-NEXT:    mov v0.h[5], w8
-; CHECK-GI-NEXT:    ldrb w8, [x0, #6]
-; CHECK-GI-NEXT:    mov v0.h[6], w8
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[3]
-; CHECK-GI-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NEXT:    mov h5, v0.h[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov h1, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.b[1], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.b[2], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.b[3], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.b[4], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.b[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.b[6], w8
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #3]
+; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #4]
+; CHECK-GI-NEXT:    mov v0.b[4], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #5]
+; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #6]
+; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i8>, ptr %ptr
@@ -307,26 +289,19 @@ define <7 x i16> @load_v7i16(ptr %ptr){
 ;
 ; CHECK-GI-LABEL: load_v7i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr h1, [x0]
-; CHECK-GI-NEXT:    ldr h0, [x0, #2]
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x8]
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #6
-; CHECK-GI-NEXT:    ld1 { v1.h }[3], [x8]
+; CHECK-GI-NEXT:    ld1 { v0.h }[3], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #8
-; CHECK-GI-NEXT:    ld1 { v1.h }[4], [x8]
+; CHECK-GI-NEXT:    ld1 { v0.h }[4], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #10
-; CHECK-GI-NEXT:    ld1 { v1.h }[5], [x8]
+; CHECK-GI-NEXT:    ld1 { v0.h }[5], [x8]
 ; CHECK-GI-NEXT:    add x8, x0, #12
-; CHECK-GI-NEXT:    ld1 { v1.h }[6], [x8]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT:    ld1 { v0.h }[6], [x8]
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i16>, ptr %ptr
     ret <7 x i16> %a
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index c8344a39da56a7..a9517383cae0db 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -1086,80 +1086,10 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){
 }
 
 define <7 x i8> @shl_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-SD-LABEL: shl_v7i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ushl v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: shl_v7i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov b3, v0.b[1]
-; CHECK-GI-NEXT:    mov b4, v1.b[1]
-; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
-; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
-; CHECK-GI-NEXT:    mov b6, v0.b[2]
-; CHECK-GI-NEXT:    mov b7, v1.b[2]
-; CHECK-GI-NEXT:    mov v2.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
-; CHECK-GI-NEXT:    mov b4, v1.b[3]
-; CHECK-GI-NEXT:    mov v2.b[2], v6.b[0]
-; CHECK-GI-NEXT:    mov b6, v0.b[4]
-; CHECK-GI-NEXT:    mov v5.b[2], v7.b[0]
-; CHECK-GI-NEXT:    mov b7, v1.b[4]
-; CHECK-GI-NEXT:    mov v2.b[3], v3.b[0]
-; CHECK-GI-NEXT:    mov b3, v0.b[5]
-; CHECK-GI-NEXT:    mov b0, v0.b[6]
-; CHECK-GI-NEXT:    mov v5.b[3], v4.b[0]
-; CHECK-GI-NEXT:    mov b4, v1.b[5]
-; CHECK-GI-NEXT:    mov b1, v1.b[6]
-; CHECK-GI-NEXT:    mov v2.b[4], v6.b[0]
-; CHECK-GI-NEXT:    mov v5.b[4], v7.b[0]
-; CHECK-GI-NEXT:    mov v2.b[5], v3.b[0]
-; CHECK-GI-NEXT:    mov v5.b[5], v4.b[0]
-; CHECK-GI-NEXT:    mov v2.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v5.b[6], v1.b[0]
-; CHECK-GI-NEXT:    ushl v0.8b, v2.8b, v5.8b
-; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov b4, v0.b[4]
-; CHECK-GI-NEXT:    mov b5, v0.b[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov b1, v0.b[5]
-; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.h[2], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.h[4], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.h[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.h[6], w8
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[3]
-; CHECK-GI-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NEXT:    mov h5, v0.h[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov h1, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.b[1], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.b[2], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.b[3], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.b[4], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.b[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.b[6], w8
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: shl_v7i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
     %3 = shl <7 x i8> %0, %1
     ret <7 x i8> %3
 }
@@ -1198,36 +1128,10 @@ define <3 x i16> @shl_v3i16(<3 x i16> %0, <3 x i16> %1){
 }
 
 define <7 x i16> @shl_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-SD-LABEL: shl_v7i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ushl v0.8h, v0.8h, v1.8h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: shl_v7i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-GI-NEXT:    ushl v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: shl_v7i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
     %3 = shl <7 x i16> %0, %1
     ret <7 x i16> %3
 }
@@ -1301,82 +1205,11 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){
 }
 
 define <7 x i8> @ashr_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-SD-LABEL: ashr_v7i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    neg v1.8b, v1.8b
-; CHECK-SD-NEXT:    sshl v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ashr_v7i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov b2, v1.b[1]
-; CHECK-GI-NEXT:    mov v3.b[0], v1.b[0]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov b4, v0.b[1]
-; CHECK-GI-NEXT:    mov v5.b[0], v0.b[0]
-; CHECK-GI-NEXT:    mov b6, v1.b[2]
-; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
-; CHECK-GI-NEXT:    mov b4, v1.b[3]
-; CHECK-GI-NEXT:    mov v3.b[2], v6.b[0]
-; CHECK-GI-NEXT:    mov b6, v0.b[3]
-; CHECK-GI-NEXT:    mov v5.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[4]
-; CHECK-GI-NEXT:    mov v3.b[3], v4.b[0]
-; CHECK-GI-NEXT:    mov b4, v0.b[4]
-; CHECK-GI-NEXT:    mov v5.b[3], v6.b[0]
-; CHECK-GI-NEXT:    mov b6, v1.b[5]
-; CHECK-GI-NEXT:    mov b1, v1.b[6]
-; CHECK-GI-NEXT:    mov v3.b[4], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v0.b[5]
-; CHECK-GI-NEXT:    mov b0, v0.b[6]
-; CHECK-GI-NEXT:    mov v5.b[4], v4.b[0]
-; CHECK-GI-NEXT:    mov v3.b[5], v6.b[0]
-; CHECK-GI-NEXT:    mov v5.b[5], v2.b[0]
-; CHECK-GI-NEXT:    mov v3.b[6], v1.b[0]
-; CHECK-GI-NEXT:    mov v5.b[6], v0.b[0]
-; CHECK-GI-NEXT:    neg v0.8b, v3.8b
-; CHECK-GI-NEXT:    sshl v0.8b, v5.8b, v0.8b
-; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov b4, v0.b[4]
-; CHECK-GI-NEXT:    mov b5, v0.b[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov b1, v0.b[5]
-; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.h[2], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.h[4], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.h[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.h[6], w8
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[3]
-; CHECK-GI-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NEXT:    mov h5, v0.h[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov h1, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.b[1], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.b[2], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.b[3], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.b[4], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.b[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.b[6], w8
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: ashr_v7i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.8b, v1.8b
+; CHECK-NEXT:    sshl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
     %3 = ashr <7 x i8> %0, %1
     ret <7 x i8> %3
 }
@@ -1417,38 +1250,11 @@ define <3 x i16> @ashr_v3i16(<3 x i16> %0, <3 x i16> %1){
 }
 
 define <7 x i16> @ashr_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-SD-LABEL: ashr_v7i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    neg v1.8h, v1.8h
-; CHECK-SD-NEXT:    sshl v0.8h, v0.8h, v1.8h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ashr_v7i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v2.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v3.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v3.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v2.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v3.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v2.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v3.h[3], v0.h[3]
-; CHECK-GI-NEXT:    mov v2.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v3.h[4], v0.h[4]
-; CHECK-GI-NEXT:    mov v2.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v3.h[5], v0.h[5]
-; CHECK-GI-NEXT:    mov v2.h[6], v1.h[6]
-; CHECK-GI-NEXT:    mov v3.h[6], v0.h[6]
-; CHECK-GI-NEXT:    neg v0.8h, v2.8h
-; CHECK-GI-NEXT:    sshl v1.8h, v3.8h, v0.8h
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: ashr_v7i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.8h, v1.8h
+; CHECK-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
     %3 = ashr <7 x i16> %0, %1
     ret <7 x i16> %3
 }
@@ -1523,82 +1329,11 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){
 }
 
 define <7 x i8> @lshr_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-SD-LABEL: lshr_v7i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    neg v1.8b, v1.8b
-; CHECK-SD-NEXT:    ushl v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: lshr_v7i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov b2, v1.b[1]
-; CHECK-GI-NEXT:    mov v3.b[0], v1.b[0]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov b4, v0.b[1]
-; CHECK-GI-NEXT:    mov v5.b[0], v0.b[0]
-; CHECK-GI-NEXT:    mov b6, v1.b[2]
-; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
-; CHECK-GI-NEXT:    mov b4, v1.b[3]
-; CHECK-GI-NEXT:    mov v3.b[2], v6.b[0]
-; CHECK-GI-NEXT:    mov b6, v0.b[3]
-; CHECK-GI-NEXT:    mov v5.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[4]
-; CHECK-GI-NEXT:    mov v3.b[3], v4.b[0]
-; CHECK-GI-NEXT:    mov b4, v0.b[4]
-; CHECK-GI-NEXT:    mov v5.b[3], v6.b[0]
-; CHECK-GI-NEXT:    mov b6, v1.b[5]
-; CHECK-GI-NEXT:    mov b1, v1.b[6]
-; CHECK-GI-NEXT:    mov v3.b[4], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v0.b[5]
-; CHECK-GI-NEXT:    mov b0, v0.b[6]
-; CHECK-GI-NEXT:    mov v5.b[4], v4.b[0]
-; CHECK-GI-NEXT:    mov v3.b[5], v6.b[0]
-; CHECK-GI-NEXT:    mov v5.b[5], v2.b[0]
-; CHECK-GI-NEXT:    mov v3.b[6], v1.b[0]
-; CHECK-GI-NEXT:    mov v5.b[6], v0.b[0]
-; CHECK-GI-NEXT:    neg v0.8b, v3.8b
-; CHECK-GI-NEXT:    ushl v0.8b, v5.8b, v0.8b
-; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov b4, v0.b[4]
-; CHECK-GI-NEXT:    mov b5, v0.b[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov b1, v0.b[5]
-; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.h[2], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.h[4], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.h[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.h[6], w8
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[3]
-; CHECK-GI-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NEXT:    mov h5, v0.h[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov h1, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.b[1], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.b[2], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.b[3], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.b[4], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.b[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.b[6], w8
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: lshr_v7i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.8b, v1.8b
+; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
     %3 = lshr <7 x i8> %0, %1
     ret <7 x i8> %3
 }
@@ -1639,38 +1374,11 @@ define <3 x i16> @lshr_v3i16(<3 x i16> %0, <3 x i16> %1){
 }
 
 define <7 x i16> @lshr_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-SD-LABEL: lshr_v7i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    neg v1.8h, v1.8h
-; CHECK-SD-NEXT:    ushl v0.8h, v0.8h, v1.8h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: lshr_v7i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v2.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v3.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v3.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v2.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v3.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v2.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v3.h[3], v0.h[3]
-; CHECK-GI-NEXT:    mov v2.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v3.h[4], v0.h[4]
-; CHECK-GI-NEXT:    mov v2.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v3.h[5], v0.h[5]
-; CHECK-GI-NEXT:    mov v2.h[6], v1.h[6]
-; CHECK-GI-NEXT:    mov v3.h[6], v0.h[6]
-; CHECK-GI-NEXT:    neg v0.8h, v2.8h
-; CHECK-GI-NEXT:    ushl v1.8h, v3.8h, v0.8h
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: lshr_v7i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v1.8h, v1.8h
+; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
     %3 = lshr <7 x i16> %0, %1
     ret <7 x i16> %3
 }
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index db0fd4293e084b..02142f9b9e71dd 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -544,62 +544,12 @@ define <7 x i8> @shufflevector_v7i8(<7 x i8> %a, <7 x i8> %b) {
 ; CHECK-GI-LABEL: shufflevector_v7i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v0.b[2]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov b4, v0.b[3]
-; CHECK-GI-NEXT:    mov b5, v0.b[4]
-; CHECK-GI-NEXT:    mov b6, v0.b[5]
-; CHECK-GI-NEXT:    mov b7, v1.b[3]
-; CHECK-GI-NEXT:    mov b16, v1.b[4]
-; CHECK-GI-NEXT:    mov b17, v1.b[5]
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov b2, v0.b[6]
-; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov v0.h[2], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov b4, v1.b[2]
-; CHECK-GI-NEXT:    fmov w9, s3
-; CHECK-GI-NEXT:    mov b3, v1.b[6]
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v1.h[1], w9
-; CHECK-GI-NEXT:    fmov w9, s4
-; CHECK-GI-NEXT:    mov v0.h[4], w8
-; CHECK-GI-NEXT:    fmov w8, s6
-; CHECK-GI-NEXT:    mov v1.h[2], w9
-; CHECK-GI-NEXT:    fmov w9, s7
-; CHECK-GI-NEXT:    mov v0.h[5], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v1.h[3], w9
-; CHECK-GI-NEXT:    mov v0.h[6], w8
-; CHECK-GI-NEXT:    fmov w8, s16
-; CHECK-GI-NEXT:    mov v1.h[4], w8
-; CHECK-GI-NEXT:    fmov w8, s17
-; CHECK-GI-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NEXT:    mov h0, v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[5], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    fmov w9, s4
-; CHECK-GI-NEXT:    mov v2.b[1], w9
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    mov v1.h[6], w8
-; CHECK-GI-NEXT:    mov v2.b[2], w9
-; CHECK-GI-NEXT:    mov h0, v1.h[1]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov h3, v1.h[3]
-; CHECK-GI-NEXT:    mov v2.b[3], w8
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    mov h0, v1.h[5]
-; CHECK-GI-NEXT:    mov v2.b[4], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v2.b[5], w8
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    mov v2.b[6], w8
-; CHECK-GI-NEXT:    fmov d0, d2
+; CHECK-GI-NEXT:    adrp x8, .LCPI31_0
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI31_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
     ret <7 x i8> %c
@@ -645,27 +595,11 @@ define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
 ;
 ; CHECK-GI-LABEL: shufflevector_v7i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v3.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v3.h[1], v1.h[1]
-; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v3.h[2], v1.h[2]
-; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
-; CHECK-GI-NEXT:    mov v3.h[3], v1.h[3]
-; CHECK-GI-NEXT:    mov v2.h[4], v0.h[4]
-; CHECK-GI-NEXT:    mov v3.h[4], v1.h[4]
-; CHECK-GI-NEXT:    mov v2.h[5], v0.h[5]
-; CHECK-GI-NEXT:    mov v3.h[5], v1.h[5]
-; CHECK-GI-NEXT:    mov v2.h[6], v0.h[6]
-; CHECK-GI-NEXT:    mov v3.h[6], v1.h[6]
-; CHECK-GI-NEXT:    mov v0.h[0], v2.h[1]
-; CHECK-GI-NEXT:    mov v0.h[1], v2.h[3]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[5]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    mov v0.h[4], v3.h[1]
-; CHECK-GI-NEXT:    mov v0.h[5], v3.h[3]
-; CHECK-GI-NEXT:    mov v0.h[6], v3.h[5]
+; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
     ret <7 x i16> %c
@@ -714,47 +648,11 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
 }
 
 define <7 x i8> @shufflevector_v7i8_zeroes(<7 x i8> %a, <7 x i8> %b) {
-; CHECK-SD-LABEL: shufflevector_v7i8_zeroes:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    dup v0.8b, v0.b[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: shufflevector_v7i8_zeroes:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov b4, v0.b[4]
-; CHECK-GI-NEXT:    mov b5, v0.b[6]
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov b1, v0.b[5]
-; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.h[2], w8
-; CHECK-GI-NEXT:    fmov w8, s3
-; CHECK-GI-NEXT:    mov v0.h[3], w8
-; CHECK-GI-NEXT:    fmov w8, s4
-; CHECK-GI-NEXT:    mov v0.h[4], w8
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.h[5], w8
-; CHECK-GI-NEXT:    fmov w8, s5
-; CHECK-GI-NEXT:    mov v0.h[6], w8
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    fmov w10, s0
-; CHECK-GI-NEXT:    fmov w11, s0
-; CHECK-GI-NEXT:    fmov w12, s0
-; CHECK-GI-NEXT:    fmov w13, s0
-; CHECK-GI-NEXT:    mov v0.b[1], w8
-; CHECK-GI-NEXT:    mov v0.b[2], w9
-; CHECK-GI-NEXT:    mov v0.b[3], w10
-; CHECK-GI-NEXT:    mov v0.b[4], w11
-; CHECK-GI-NEXT:    mov v0.b[5], w12
-; CHECK-GI-NEXT:    mov v0.b[6], w13
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: shufflevector_v7i8_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-NEXT:    ret
     %c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
     ret <7 x i8> %c
 }
@@ -785,28 +683,10 @@ define <3 x i16> @shufflevector_v3i16_zeroes(<3 x i16> %a, <3 x i16> %b) {
 }
 
 define <7 x i16> @shufflevector_v7i16_zeroes(<7 x i16> %a, <7 x i16> %b) {
-; CHECK-SD-LABEL: shufflevector_v7i16_zeroes:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    dup v0.8h, v0.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: shufflevector_v7i16_zeroes:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT:    mov v1.h[4], v0.h[4]
-; CHECK-GI-NEXT:    mov v1.h[5], v0.h[5]
-; CHECK-GI-NEXT:    mov v1.h[6], v0.h[6]
-; CHECK-GI-NEXT:    mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: shufflevector_v7i16_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
     %c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
     ret <7 x i16> %c
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index 4d400d53916f16..f426fb8954ed26 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -2115,7 +2115,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2172,7 +2172,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2237,7 +2237,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %43, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2294,7 +2294,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2357,7 +2357,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %43, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2414,7 +2414,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2476,7 +2476,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2534,7 +2534,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2596,7 +2596,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2654,7 +2654,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2716,7 +2716,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2774,7 +2774,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2835,7 +2835,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
   ; GFX7-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; GFX7-NEXT: {{  $}}
   ; GFX7-NEXT: bb.2:
-  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+  ; GFX7-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
   ; GFX7-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX7-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2891,7 +2891,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
   ; GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX12-NEXT: {{  $}}
   ; GFX12-NEXT: bb.2:
-  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
+  ; GFX12-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
   ; GFX12-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
   ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec


