[llvm] [GlobalIsel] Combine G_UNMERGE_VALUES from opaque vectors into scalars (PR #113040)
Thorsten Schütt via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 19 09:53:29 PDT 2024
https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/113040
From d08de498d8b234d75e791665e28f7811fb499d27 Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Sat, 19 Oct 2024 15:58:45 +0200
Subject: [PATCH 1/2] [GlobalIsel] Combine G_UNMERGE_VALUES from opaque vectors
into scalars
%opaque:_(<2 x s64>) = G_OPAQUE
%un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>)
->
%zero:_(s64) = G_CONSTANT i64 0
%one:_(s64) = G_CONSTANT i64 1
%un1:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %zero
%un2:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %one
The combine is restricted to scalar destinations of at most 64 bits; without that restriction the wider extracts fail to legalize, e.g.:
unable to legalize instruction: %5:_(s128) = G_EXTRACT_VECTOR_ELT %3:_(<2 x s128>), %7:_(s64) (in function: fabs_v2f128)
Test:
llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
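
As a rough illustration of how the new rule is driven: the TableGen pattern below pairs matchUnmergeValuesOfScalarAndVector with the generic Helper.applyBuildFn. A hand-written sketch of the equivalent per-instruction logic follows; tryCombineUnmergeOpaqueVector is an illustrative name, and the surrounding CombinerHelper/MachineIRBuilder setup is assumed to exist as in any CombinerHelper-based combiner.

// Sketch only: drive the new match/apply pair by hand for a single
// G_UNMERGE_VALUES, mirroring what the generated matcher does.
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include <functional>

using namespace llvm;

static bool tryCombineUnmergeOpaqueVector(MachineInstr &MI,
                                          CombinerHelper &Helper) {
  if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES)
    return false;
  // BuildFnTy is std::function<void(MachineIRBuilder &)>.
  std::function<void(MachineIRBuilder &)> MatchInfo;
  if (!Helper.matchUnmergeValuesOfScalarAndVector(MI, MatchInfo))
    return false;
  // On success, this emits one G_CONSTANT index plus one
  // G_EXTRACT_VECTOR_ELT per unmerge def and erases the G_UNMERGE_VALUES.
  Helper.applyBuildFn(MI, MatchInfo);
  return true;
}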
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 4 +
.../include/llvm/Target/GlobalISel/Combine.td | 11 +-
llvm/lib/CodeGen/GlobalISel/CMakeLists.txt | 1 +
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 82 -
.../GlobalISel/CombinerHelperArtifacts.cpp | 169 ++
llvm/lib/Target/AArch64/AArch64Combine.td | 2 +-
.../AArch64/GlobalISel/combine-unmerge.mir | 104 +-
llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll | 146 +-
llvm/test/CodeGen/AArch64/abs.ll | 152 +-
llvm/test/CodeGen/AArch64/add.ll | 25 +-
llvm/test/CodeGen/AArch64/andorxor.ll | 75 +-
llvm/test/CodeGen/AArch64/arm64-fp128.ll | 7 +-
llvm/test/CodeGen/AArch64/bitcast.ll | 25 +-
llvm/test/CodeGen/AArch64/bswap.ll | 74 +-
llvm/test/CodeGen/AArch64/fabs.ll | 54 +-
llvm/test/CodeGen/AArch64/faddsub.ll | 178 +-
llvm/test/CodeGen/AArch64/fcmp.ll | 405 +--
llvm/test/CodeGen/AArch64/fcopysign.ll | 85 +-
llvm/test/CodeGen/AArch64/fcvt.ll | 511 +++-
llvm/test/CodeGen/AArch64/fdiv.ll | 89 +-
llvm/test/CodeGen/AArch64/fexplog.ll | 230 +-
.../AArch64/fixed-vector-deinterleave.ll | 8 +-
llvm/test/CodeGen/AArch64/fminimummaximum.ll | 178 +-
llvm/test/CodeGen/AArch64/fminmax.ll | 178 +-
llvm/test/CodeGen/AArch64/fmla.ll | 320 ++-
llvm/test/CodeGen/AArch64/fmul.ll | 89 +-
llvm/test/CodeGen/AArch64/fneg.ll | 52 +-
llvm/test/CodeGen/AArch64/fpext.ll | 39 +-
llvm/test/CodeGen/AArch64/fpow.ll | 58 +-
llvm/test/CodeGen/AArch64/fpowi.ll | 44 +-
llvm/test/CodeGen/AArch64/fptoi.ll | 518 +++-
.../test/CodeGen/AArch64/fptosi-sat-vector.ll | 217 +-
.../test/CodeGen/AArch64/fptoui-sat-vector.ll | 194 +-
llvm/test/CodeGen/AArch64/fptrunc.ll | 58 +-
llvm/test/CodeGen/AArch64/frem.ll | 58 +-
llvm/test/CodeGen/AArch64/fsincos.ll | 92 +-
llvm/test/CodeGen/AArch64/fsqrt.ll | 69 +-
llvm/test/CodeGen/AArch64/icmp.ll | 56 +-
llvm/test/CodeGen/AArch64/insertextract.ll | 30 +-
llvm/test/CodeGen/AArch64/itofp.ll | 714 ++++--
llvm/test/CodeGen/AArch64/llvm.exp10.ll | 81 +-
llvm/test/CodeGen/AArch64/load.ll | 104 +-
llvm/test/CodeGen/AArch64/mul.ll | 29 +-
llvm/test/CodeGen/AArch64/rem.ll | 126 +-
llvm/test/CodeGen/AArch64/sext.ll | 118 +-
llvm/test/CodeGen/AArch64/shift.ll | 564 ++++-
llvm/test/CodeGen/AArch64/shufflevector.ll | 289 ++-
llvm/test/CodeGen/AArch64/sub.ll | 25 +-
.../AArch64/vecreduce-umax-legalization.ll | 22 +-
llvm/test/CodeGen/AArch64/xtn.ll | 17 +-
llvm/test/CodeGen/AArch64/zext.ll | 98 +-
.../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll | 57 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 14 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll | 9 +
.../combine-fma-add-mul-pre-legalize.mir | 240 +-
.../GlobalISel/combine-fma-unmerge-values.mir | 60 +-
.../CodeGen/AMDGPU/GlobalISel/dummy-target.ll | 16 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 382 +--
llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll | 82 +-
.../CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll | 12 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll | 98 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 20 +
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 90 +-
.../legalize-llvm.amdgcn.image.dim.a16.ll | 1098 ++++++---
.../legalize-llvm.amdgcn.image.load.2d.d16.ll | 94 +-
.../legalize-llvm.amdgcn.image.load.2d.ll | 228 +-
...lize-llvm.amdgcn.image.load.2darraymsaa.ll | 48 +-
.../legalize-llvm.amdgcn.image.sample.a16.ll | 2184 +++++++++++------
.../legalize-llvm.amdgcn.image.sample.d.ll | 144 +-
...galize-llvm.amdgcn.image.sample.g16.a16.ll | 108 +-
.../legalize-llvm.amdgcn.image.sample.g16.ll | 630 +++--
...legalize-llvm.amdgcn.image.store.2d.d16.ll | 135 +-
.../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 14 +-
.../load-legalize-range-metadata.ll | 16 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 8 +-
.../regbankselect-amdgcn.s.buffer.load.ll | 28 +-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 17 +-
.../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 14 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 10 +-
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 17 +-
.../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 3 +-
.../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 3 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 25 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 71 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 71 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 45 +-
llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 10 +-
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 42 +-
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 111 +-
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 111 +-
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 81 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 167 +-
llvm/test/CodeGen/AMDGPU/roundeven.ll | 84 +-
llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll | 6 +-
94 files changed, 9544 insertions(+), 4033 deletions(-)
create mode 100644 llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9240a3c3127eb4..87409c88788e6a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -922,6 +922,10 @@ class CombinerHelper {
bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
BuildFnTy &MatchInfo);
+ // unmerge_values(opaque vector) -> extract vector elt
+ bool matchUnmergeValuesOfScalarAndVector(const MachineInstr &MI,
+ BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ead4149fc11068..39dd58837d5750 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -840,6 +840,14 @@ def unmerge_anyext_build_vector : GICombineRule<
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
>;
+// Transform unmerge opaque vector -> extract vector elt
+def unmerge_opaque_vector : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_UNMERGE_VALUES): $root,
+ [{ return Helper.matchUnmergeValuesOfScalarAndVector(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
+>;
+
// Transform x,y = unmerge(zext(z)) -> x = zext z; y = 0.
def unmerge_zext_to_zext : GICombineRule<
(defs root:$d),
@@ -855,7 +863,8 @@ def merge_combines: GICombineGroup<[
unmerge_cst,
unmerge_undef,
unmerge_dead_to_trunc,
- unmerge_zext_to_zext
+ unmerge_zext_to_zext,
+ unmerge_opaque_vector
]>;
// Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index af1717dbf76f39..a45024d120be68 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel
GlobalISel.cpp
Combiner.cpp
CombinerHelper.cpp
+ CombinerHelperArtifacts.cpp
CombinerHelperCasts.cpp
CombinerHelperCompares.cpp
CombinerHelperVectorOps.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b7ddf9f479ef8e..f9b1621955c217 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7611,85 +7611,3 @@ bool CombinerHelper::matchFoldAMinusC1PlusC2(const MachineInstr &MI,
return true;
}
-
-bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
- BuildFnTy &MatchInfo) {
- const GUnmerge *Unmerge = cast<GUnmerge>(&MI);
-
- if (!MRI.hasOneNonDBGUse(Unmerge->getSourceReg()))
- return false;
-
- const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg());
-
- LLT DstTy = MRI.getType(Unmerge->getReg(0));
-
- // $bv:_(<8 x s8>) = G_BUILD_VECTOR ....
- // $any:_(<8 x s16>) = G_ANYEXT $bv
- // $uv:_(<4 x s16>), $uv1:_(<4 x s16>) = G_UNMERGE_VALUES $any
- //
- // ->
- //
- // $any:_(s16) = G_ANYEXT $bv[0]
- // $any1:_(s16) = G_ANYEXT $bv[1]
- // $any2:_(s16) = G_ANYEXT $bv[2]
- // $any3:_(s16) = G_ANYEXT $bv[3]
- // $any4:_(s16) = G_ANYEXT $bv[4]
- // $any5:_(s16) = G_ANYEXT $bv[5]
- // $any6:_(s16) = G_ANYEXT $bv[6]
- // $any7:_(s16) = G_ANYEXT $bv[7]
- // $uv:_(<4 x s16>) = G_BUILD_VECTOR $any, $any1, $any2, $any3
- // $uv1:_(<4 x s16>) = G_BUILD_VECTOR $any4, $any5, $any6, $any7
-
- // We want to unmerge into vectors.
- if (!DstTy.isFixedVector())
- return false;
-
- const GAnyExt *Any = dyn_cast<GAnyExt>(Source);
- if (!Any)
- return false;
-
- const MachineInstr *NextSource = MRI.getVRegDef(Any->getSrcReg());
-
- if (const GBuildVector *BV = dyn_cast<GBuildVector>(NextSource)) {
- // G_UNMERGE_VALUES G_ANYEXT G_BUILD_VECTOR
-
- if (!MRI.hasOneNonDBGUse(BV->getReg(0)))
- return false;
-
- // FIXME: check element types?
- if (BV->getNumSources() % Unmerge->getNumDefs() != 0)
- return false;
-
- LLT BigBvTy = MRI.getType(BV->getReg(0));
- LLT SmallBvTy = DstTy;
- LLT SmallBvElemenTy = SmallBvTy.getElementType();
-
- if (!isLegalOrBeforeLegalizer(
- {TargetOpcode::G_BUILD_VECTOR, {SmallBvTy, SmallBvElemenTy}}))
- return false;
-
- // We check the legality of scalar anyext.
- if (!isLegalOrBeforeLegalizer(
- {TargetOpcode::G_ANYEXT,
- {SmallBvElemenTy, BigBvTy.getElementType()}}))
- return false;
-
- MatchInfo = [=](MachineIRBuilder &B) {
- // Build into each G_UNMERGE_VALUES def
- // a small build vector with anyext from the source build vector.
- for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) {
- SmallVector<Register> Ops;
- for (unsigned J = 0; J < SmallBvTy.getNumElements(); ++J) {
- Register SourceArray =
- BV->getSourceReg(I * SmallBvTy.getNumElements() + J);
- auto AnyExt = B.buildAnyExt(SmallBvElemenTy, SourceArray);
- Ops.push_back(AnyExt.getReg(0));
- }
- B.buildBuildVector(Unmerge->getOperand(I).getReg(), Ops);
- };
- };
- return true;
- };
-
- return false;
-}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
new file mode 100644
index 00000000000000..805d34ae0493c4
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -0,0 +1,169 @@
+//===- CombinerHelperArtifacts.cpp-----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements CombinerHelper for legalization artifacts.
+//
+//===----------------------------------------------------------------------===//
+//
+// G_UNMERGE_VALUES
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GUnmerge *Unmerge = cast<GUnmerge>(&MI);
+
+ if (!MRI.hasOneNonDBGUse(Unmerge->getSourceReg()))
+ return false;
+
+ const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg());
+
+ LLT DstTy = MRI.getType(Unmerge->getReg(0));
+
+ // $bv:_(<8 x s8>) = G_BUILD_VECTOR ....
+ // $any:_(<8 x s16>) = G_ANYEXT $bv
+ // $uv:_(<4 x s16>), $uv1:_(<4 x s16>) = G_UNMERGE_VALUES $any
+ //
+ // ->
+ //
+ // $any:_(s16) = G_ANYEXT $bv[0]
+ // $any1:_(s16) = G_ANYEXT $bv[1]
+ // $any2:_(s16) = G_ANYEXT $bv[2]
+ // $any3:_(s16) = G_ANYEXT $bv[3]
+ // $any4:_(s16) = G_ANYEXT $bv[4]
+ // $any5:_(s16) = G_ANYEXT $bv[5]
+ // $any6:_(s16) = G_ANYEXT $bv[6]
+ // $any7:_(s16) = G_ANYEXT $bv[7]
+ // $uv:_(<4 x s16>) = G_BUILD_VECTOR $any, $any1, $any2, $any3
+ // $uv1:_(<4 x s16>) = G_BUILD_VECTOR $any4, $any5, $any6, $any7
+
+ // We want to unmerge into vectors.
+ if (!DstTy.isFixedVector())
+ return false;
+
+ const GAnyExt *Any = dyn_cast<GAnyExt>(Source);
+ if (!Any)
+ return false;
+
+ const MachineInstr *NextSource = MRI.getVRegDef(Any->getSrcReg());
+
+ if (const GBuildVector *BV = dyn_cast<GBuildVector>(NextSource)) {
+ // G_UNMERGE_VALUES G_ANYEXT G_BUILD_VECTOR
+
+ if (!MRI.hasOneNonDBGUse(BV->getReg(0)))
+ return false;
+
+ // FIXME: check element types?
+ if (BV->getNumSources() % Unmerge->getNumDefs() != 0)
+ return false;
+
+ LLT BigBvTy = MRI.getType(BV->getReg(0));
+ LLT SmallBvTy = DstTy;
+ LLT SmallBvElemenTy = SmallBvTy.getElementType();
+
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_BUILD_VECTOR, {SmallBvTy, SmallBvElemenTy}}))
+ return false;
+
+ // We check the legality of scalar anyext.
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_ANYEXT,
+ {SmallBvElemenTy, BigBvTy.getElementType()}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ // Build into each G_UNMERGE_VALUES def
+ // a small build vector with anyext from the source build vector.
+ for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) {
+ SmallVector<Register> Ops;
+ for (unsigned J = 0; J < SmallBvTy.getNumElements(); ++J) {
+ Register SourceArray =
+ BV->getSourceReg(I * SmallBvTy.getNumElements() + J);
+ auto AnyExt = B.buildAnyExt(SmallBvElemenTy, SourceArray);
+ Ops.push_back(AnyExt.getReg(0));
+ }
+ B.buildBuildVector(Unmerge->getOperand(I).getReg(), Ops);
+ };
+ };
+ return true;
+ };
+
+ return false;
+}
+
+bool CombinerHelper::matchUnmergeValuesOfScalarAndVector(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+
+ constexpr unsigned MAX_NUM_DEFS_LIMIT = 8;
+
+ // %opaque:_(<2 x s64>) = G_OPAQUE
+ // %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>)
+ //
+ // ->
+ //
+ // %zero:_(s64) = G_CONSTANT i64 0
+ // %one:_(s64) = G_CONSTANT i64 1
+  // %un1:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %zero
+  // %un2:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %one
+
+ const GUnmerge *Unmerge = cast<GUnmerge>(&MI);
+
+ if (Unmerge->getNumDefs() > MAX_NUM_DEFS_LIMIT)
+ return false;
+
+ LLT DstTy = MRI.getType(Unmerge->getReg(0));
+ LLT SrcTy = MRI.getType(Unmerge->getSourceReg());
+
+ // We want to unmerge a vector into scalars.
+ if (!DstTy.isScalar() || !SrcTy.isFixedVector() || DstTy.getSizeInBits() > 64)
+ return false;
+
+ if (DstTy != SrcTy.getElementType())
+ return false;
+
+ // We want to unmerge from an opaque vector.
+ const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg());
+ if (isa<GBuildVector>(Source))
+ return false;
+
+ unsigned PreferredVecIdxWidth =
+ getTargetLowering().getVectorIdxTy(getDataLayout()).getSizeInBits();
+
+ LLT IdxTy = LLT::scalar(PreferredVecIdxWidth);
+
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_EXTRACT_VECTOR_ELT, {DstTy, SrcTy, IdxTy}}))
+ return false;
+
+ if (!isConstantLegalOrBeforeLegalizer(IdxTy))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) {
+ auto Index = B.buildConstant(IdxTy, I);
+ B.buildExtractVectorElement(Unmerge->getOperand(I).getReg(),
+ Unmerge->getSourceReg(), Index);
+ }
+ };
+
+ return true;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 8af8cdfeba6ac4..1eb7488e4ff570 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -322,7 +322,7 @@ def AArch64PostLegalizerCombiner
extractvecelt_pairwise_add, redundant_or,
mul_const, redundant_sext_inreg,
form_bitfield_extract, rotate_out_of_range,
- icmp_to_true_false_known_bits,
+ icmp_to_true_false_known_bits, vector_ops_combines,
select_combines, fold_merge_to_zext,
constant_fold_binops, identity_combines,
ptr_add_immed_chain, overlapping_and,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 7566d38e6c6cfa..fc7584a2e1b162 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -422,9 +422,12 @@ body: |
; CHECK-LABEL: name: test_dont_combine_unmerge_zext_to_zext_src_vector
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $w0
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<2 x s32>) = G_ZEXT [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](<2 x s32>)
- ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ZEXT]](<2 x s32>), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ZEXT]](<2 x s32>), [[C1]](s64)
+ ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[EVEC1]](s32)
%0:_(<2 x s16>) = COPY $w0
%3:_(<2 x s32>) = G_ZEXT %0(<2 x s16>)
%1:_(s32),%2:_(s32) = G_UNMERGE_VALUES %3(<2 x s32>)
@@ -539,3 +542,98 @@ body: |
$q0 = COPY %un1(s128)
$q1 = COPY %un2(s128)
...
+
+# Check that we unmerge the opaque vector into extract vector elt
+---
+name: test_opaque_vector_scalar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_opaque_vector_scalar
+ ; CHECK: %opaque:_(<2 x s64>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: %un1:_(s64) = G_EXTRACT_VECTOR_ELT %opaque(<2 x s64>), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: %un2:_(s64) = G_EXTRACT_VECTOR_ELT %opaque(<2 x s64>), [[C1]](s64)
+ ; CHECK-NEXT: $x0 = COPY %un1(s64)
+ ; CHECK-NEXT: $x1 = COPY %un2(s64)
+ %opaque:_(<2 x s64>) = COPY $q0
+ %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>)
+ $x0 = COPY %un1(s64)
+ $x1 = COPY %un2(s64)
+...
+
+# Check that we don't unmerge the opaque vector into scalars
+---
+name: test_opaque_vector_vector
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_opaque_vector_vector
+ ; CHECK: %opaque:_(s128) = COPY $q0
+ ; CHECK-NEXT: %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(s128)
+ ; CHECK-NEXT: $x0 = COPY %un1(s64)
+ ; CHECK-NEXT: $x1 = COPY %un2(s64)
+ %opaque:_(s128) = COPY $q0
+ %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(s128)
+ $x0 = COPY %un1(s64)
+ $x1 = COPY %un2(s64)
+...
+
+# Check that we unmerge the long opaque vector into extract vector elt
+---
+name: test_long_opaque_vector_scalar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_long_opaque_vector_scalar
+ ; CHECK: %opaque:_(<8 x s16>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: %un1:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: %un2:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C1]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; CHECK-NEXT: %un3:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C2]](s64)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; CHECK-NEXT: %un4:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C3]](s64)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; CHECK-NEXT: %un5:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C4]](s64)
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+ ; CHECK-NEXT: %un6:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C5]](s64)
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+ ; CHECK-NEXT: %un7:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C6]](s64)
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+ ; CHECK-NEXT: %un8:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C7]](s64)
+ ; CHECK-NEXT: %zext1:_(s32) = G_ZEXT %un1(s16)
+ ; CHECK-NEXT: %zext2:_(s32) = G_ZEXT %un2(s16)
+ ; CHECK-NEXT: %zext3:_(s32) = G_ZEXT %un3(s16)
+ ; CHECK-NEXT: %zext4:_(s32) = G_ZEXT %un4(s16)
+ ; CHECK-NEXT: %zext5:_(s32) = G_ZEXT %un5(s16)
+ ; CHECK-NEXT: %zext6:_(s32) = G_ZEXT %un6(s16)
+ ; CHECK-NEXT: %zext7:_(s32) = G_ZEXT %un7(s16)
+ ; CHECK-NEXT: %zext8:_(s32) = G_ZEXT %un8(s16)
+ ; CHECK-NEXT: $w0 = COPY %zext1(s32)
+ ; CHECK-NEXT: $w1 = COPY %zext2(s32)
+ ; CHECK-NEXT: $w0 = COPY %zext3(s32)
+ ; CHECK-NEXT: $w1 = COPY %zext4(s32)
+ ; CHECK-NEXT: $w0 = COPY %zext5(s32)
+ ; CHECK-NEXT: $w1 = COPY %zext6(s32)
+ ; CHECK-NEXT: $w0 = COPY %zext7(s32)
+ ; CHECK-NEXT: $w1 = COPY %zext8(s32)
+ %opaque:_(<8 x s16>) = COPY $q0
+ %un1:_(s16), %un2:_(s16), %un3:_(s16), %un4:_(s16), %un5:_(s16), %un6:_(s16), %un7:_(s16), %un8:_(s16) = G_UNMERGE_VALUES %opaque(<8 x s16>)
+ %zext1:_(s32) = G_ZEXT %un1
+ %zext2:_(s32) = G_ZEXT %un2
+ %zext3:_(s32) = G_ZEXT %un3
+ %zext4:_(s32) = G_ZEXT %un4
+ %zext5:_(s32) = G_ZEXT %un5
+ %zext6:_(s32) = G_ZEXT %un6
+ %zext7:_(s32) = G_ZEXT %un7
+ %zext8:_(s32) = G_ZEXT %un8
+ $w0 = COPY %zext1(s32)
+ $w1 = COPY %zext2(s32)
+ $w0 = COPY %zext3(s32)
+ $w1 = COPY %zext4(s32)
+ $w0 = COPY %zext5(s32)
+ $w1 = COPY %zext6(s32)
+ $w0 = COPY %zext7(s32)
+ $w1 = COPY %zext8(s32)
+...
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
index f7aa57a068a4ce..4d75367fa06b49 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -590,14 +590,26 @@ entry:
}
define i16 @sminv_v3i16(<3 x i16> %a) {
-; CHECK-LABEL: sminv_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: sminv h0, v0.4h
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sminv_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-SD-NEXT: mov v0.h[3], w8
+; CHECK-SD-NEXT: sminv h0, v0.4h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sminv_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], w8
+; CHECK-GI-NEXT: sminv h0, v1.4h
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a)
ret i16 %arg1
@@ -649,13 +661,24 @@ entry:
}
define i32 @sminv_v3i32(<3 x i32> %a) {
-; CHECK-LABEL: sminv_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
-; CHECK-NEXT: mov v0.s[3], w8
-; CHECK-NEXT: sminv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sminv_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-SD-NEXT: mov v0.s[3], w8
+; CHECK-SD-NEXT: sminv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sminv_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[3], w8
+; CHECK-GI-NEXT: sminv s0, v1.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.smin.v3i32(<3 x i32> %a)
ret i32 %arg1
@@ -954,9 +977,12 @@ define i16 @smaxv_v3i16(<3 x i16> %a) {
; CHECK-GI-LABEL: smaxv_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NEXT: mov w8, #32768 // =0x8000
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: smaxv h0, v0.4h
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], w8
+; CHECK-GI-NEXT: smaxv h0, v1.4h
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
@@ -1010,13 +1036,24 @@ entry:
}
define i32 @smaxv_v3i32(<3 x i32> %a) {
-; CHECK-LABEL: smaxv_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000
-; CHECK-NEXT: mov v0.s[3], w8
-; CHECK-NEXT: smaxv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: smaxv_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000
+; CHECK-SD-NEXT: mov v0.s[3], w8
+; CHECK-SD-NEXT: smaxv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: smaxv_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[3], w8
+; CHECK-GI-NEXT: smaxv s0, v1.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> %a)
ret i32 %arg1
@@ -1313,9 +1350,12 @@ define i16 @uminv_v3i16(<3 x i16> %a) {
; CHECK-GI-LABEL: uminv_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: uminv h0, v0.4h
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], w8
+; CHECK-GI-NEXT: uminv h0, v1.4h
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
@@ -1369,13 +1409,24 @@ entry:
}
define i32 @uminv_v3i32(<3 x i32> %a) {
-; CHECK-LABEL: uminv_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-NEXT: mov v0.s[3], w8
-; CHECK-NEXT: uminv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uminv_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-SD-NEXT: mov v0.s[3], w8
+; CHECK-SD-NEXT: uminv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uminv_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[3], w8
+; CHECK-GI-NEXT: uminv s0, v1.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.umin.v3i32(<3 x i32> %a)
ret i32 %arg1
@@ -1671,9 +1722,12 @@ define i16 @umaxv_v3i16(<3 x i16> %a) {
; CHECK-GI-LABEL: umaxv_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NEXT: mov w8, #0 // =0x0
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: umaxv h0, v0.4h
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], w8
+; CHECK-GI-NEXT: umaxv h0, v1.4h
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
@@ -1727,12 +1781,22 @@ entry:
}
define i32 @umaxv_v3i32(<3 x i32> %a) {
-; CHECK-LABEL: umaxv_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov v0.s[3], wzr
-; CHECK-NEXT: umaxv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: umaxv_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov v0.s[3], wzr
+; CHECK-SD-NEXT: umaxv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: umaxv_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[3], wzr
+; CHECK-GI-NEXT: umaxv s0, v1.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a)
ret i32 %arg1
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 25a14ef9a49ee8..b5794007bdddb0 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -336,9 +336,17 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){
; CHECK-GI-NEXT: mov v0.b[1], w1
; CHECK-GI-NEXT: mov v0.b[2], w2
; CHECK-GI-NEXT: abs v0.8b, v0.8b
-; CHECK-GI-NEXT: umov w0, v0.b[0]
-; CHECK-GI-NEXT: umov w1, v0.b[1]
-; CHECK-GI-NEXT: umov w2, v0.b[2]
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: umov w8, v0.b[2]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov s0, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v1.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w1, s0
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
entry:
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %a, i1 0)
@@ -347,10 +355,66 @@ entry:
declare <3 x i8> @llvm.abs.v3i8(<3 x i8>, i1)
define <7 x i8> @abs_v7i8(<7 x i8> %a){
-; CHECK-LABEL: abs_v7i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: abs v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: abs_v7i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: abs v0.8b, v0.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: abs_v7i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov b1, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[4]
+; CHECK-GI-NEXT: mov v2.b[3], v1.b[0]
+; CHECK-GI-NEXT: mov b1, v0.b[5]
+; CHECK-GI-NEXT: mov b0, v0.b[6]
+; CHECK-GI-NEXT: mov v2.b[4], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[5], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[6], v0.b[0]
+; CHECK-GI-NEXT: abs v0.8b, v2.8b
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov b5, v0.b[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov b1, v0.b[5]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.h[4], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.h[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.h[6], w8
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov h2, v0.h[3]
+; CHECK-GI-NEXT: mov h3, v0.h[4]
+; CHECK-GI-NEXT: mov h4, v0.h[5]
+; CHECK-GI-NEXT: mov h5, v0.h[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov h1, v0.h[2]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.b[4], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.b[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.b[6], w8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%res = call <7 x i8> @llvm.abs.v7i8(<7 x i8> %a, i1 0)
ret <7 x i8> %res
@@ -358,10 +422,30 @@ entry:
declare <7 x i8> @llvm.abs.v7i8(<7 x i8>, i1)
define <3 x i16> @abs_v3i16(<3 x i16> %a){
-; CHECK-LABEL: abs_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: abs v0.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: abs_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: abs v0.4h, v0.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: abs_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: abs v1.4h, v1.4h
+; CHECK-GI-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %a, i1 0)
ret <3 x i16> %res
@@ -369,10 +453,29 @@ entry:
declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
define <7 x i16> @abs_v7i16(<7 x i16> %a){
-; CHECK-LABEL: abs_v7i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: abs v0.8h, v0.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: abs_v7i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: abs v0.8h, v0.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: abs_v7i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-NEXT: abs v1.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: ret
entry:
%res = call <7 x i16> @llvm.abs.v7i16(<7 x i16> %a, i1 0)
ret <7 x i16> %res
@@ -380,10 +483,21 @@ entry:
declare <7 x i16> @llvm.abs.v7i16(<7 x i16>, i1)
define <3 x i32> @abs_v3i32(<3 x i32> %a){
-; CHECK-LABEL: abs_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: abs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: abs_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: abs v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: abs_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: abs v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%res = call <3 x i32> @llvm.abs.v3i32(<3 x i32> %a, i1 0)
ret <3 x i32> %res
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index e3072dc41d933c..5d11deaac40bee 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -343,10 +343,24 @@ entry:
}
define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%s = add <3 x i32> %d, %e
ret <3 x i32> %s
@@ -408,8 +422,9 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT: add x8, x8, x9
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 5c7429aebb31e9..70477b0c98c77a 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -1050,30 +1050,72 @@ entry:
}
define <3 x i32> @and_v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: and_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: and_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: and v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%s = and <3 x i32> %d, %e
ret <3 x i32> %s
}
define <3 x i32> @or_v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: or_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: or_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: or_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: orr v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%s = or <3 x i32> %d, %e
ret <3 x i32> %s
}
define <3 x i32> @xor_v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: xor_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: xor_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: xor_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: eor v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%s = xor <3 x i32> %d, %e
ret <3 x i32> %s
@@ -1209,8 +1251,9 @@ define <3 x i64> @and_v3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT: and x8, x8, x9
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
@@ -1238,8 +1281,9 @@ define <3 x i64> @or_v3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT: orr x8, x8, x9
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
@@ -1267,8 +1311,9 @@ define <3 x i64> @xor_v3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT: eor x8, x8, x9
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: eor v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
index 7eb26096ed1566..8ec8ba877d854e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
@@ -1216,9 +1216,12 @@ define <2 x half> @vec_round_f16(<2 x fp128> %val) {
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: mov h0, v1.h[1]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 39f2572d9fd354..8449b69a473d92 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -634,10 +634,27 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){
; ===== Vectors with Non-Pow 2 Widths =====
define <6 x i16> @bitcast_v3i32_v6i16(<3 x i32> %a, <3 x i32> %b){
-; CHECK-LABEL: bitcast_v3i32_v6i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: bitcast_v3i32_v6i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: bitcast_v3i32_v6i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: ret
%c = add <3 x i32> %a, %b
%d = bitcast <3 x i32> %c to <6 x i16>
ret <6 x i16> %d
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 74e4a167ae14ca..9f9653fcbb50b5 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -246,10 +246,30 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
; ===== Vectors with Non-Pow 2 Widths =====
define <3 x i16> @bswap_v3i16(<3 x i16> %a){
-; CHECK-LABEL: bswap_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: rev16 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: bswap_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: rev16 v0.8b, v0.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: bswap_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: rev16 v1.8b, v1.8b
+; CHECK-GI-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%res = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %a)
ret <3 x i16> %res
@@ -257,10 +277,29 @@ entry:
declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>)
define <7 x i16> @bswap_v7i16(<7 x i16> %a){
-; CHECK-LABEL: bswap_v7i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: rev16 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: bswap_v7i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: rev16 v0.16b, v0.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: bswap_v7i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-NEXT: rev16 v1.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: ret
entry:
%res = call <7 x i16> @llvm.bswap.v7i16(<7 x i16> %a)
ret <7 x i16> %res
@@ -268,10 +307,21 @@ entry:
declare <7 x i16> @llvm.bswap.v7i16(<7 x i16>)
define <3 x i32> @bswap_v3i32(<3 x i32> %a){
-; CHECK-LABEL: bswap_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: rev32 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: bswap_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: rev32 v0.16b, v0.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: bswap_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: rev32 v1.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%res = call <3 x i32> @llvm.bswap.v3i32(<3 x i32> %a)
ret <3 x i32> %res
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index 43e90070736345..1aed6cb8bf9ed8 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -88,6 +88,7 @@ define <3 x double> @fabs_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fabs d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fabs v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -119,10 +120,21 @@ entry:
}
define <3 x float> @fabs_v3f32(<3 x float> %a) {
-; CHECK-LABEL: fabs_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fabs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fabs_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fabs v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fabs_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fabs v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -162,13 +174,41 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mvni v1.8h, #128, lsl #8
-; CHECK-GI-NOFP16-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mvni v0.8h, #128, lsl #8
+; CHECK-GI-NOFP16-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fabs_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fabs v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: fabs v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.fabs.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index b15579199a0598..4227c891d844f4 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -93,6 +93,7 @@ define <3 x double> @fadd_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: fadd d2, d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fadd v0.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -130,10 +131,24 @@ entry:
}
define <3 x float> @fadd_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: fadd_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fadd_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fadd_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: fadd v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fadd <3 x float> %a, %b
ret <3 x float> %c
@@ -186,32 +201,68 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fadd_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fadd v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fadd_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fadd v1.8h, v2.8h, v3.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fadd <7 x half> %a, %b
@@ -434,6 +485,7 @@ define <3 x double> @fsub_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: fsub d2, d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fsub v0.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -471,10 +523,24 @@ entry:
}
define <3 x float> @fsub_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: fsub_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fsub_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fsub_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: fsub v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fsub <3 x float> %a, %b
ret <3 x float> %c
@@ -527,32 +593,68 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fsub_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fsub v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fsub v0.4s, v0.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fsub_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fsub v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fsub v1.8h, v2.8h, v3.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fsub <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 66f26fc9d85973..584ffa92493d08 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -783,7 +783,8 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-GI-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: orr x8, x9, x8
; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #176
@@ -856,8 +857,9 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double>
; CHECK-GI-NEXT: and x8, x8, x9
; CHECK-GI-NEXT: bic x9, x10, x9
; CHECK-GI-NEXT: orr x8, x8, x9
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: bsl v0.16b, v6.16b, v1.16b
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
@@ -930,24 +932,33 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: mov v4.s[0], v7.s[0]
; CHECK-GI-NEXT: cset w9, mi
; CHECK-GI-NEXT: mov v2.s[0], w9
; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
; CHECK-GI-NEXT: fcmgt v0.2d, v3.2d, v0.2d
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v3.s[0], w9
+; CHECK-GI-NEXT: mov v4.s[1], v7.s[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
; CHECK-GI-NEXT: mov v1.s[2], w8
; CHECK-GI-NEXT: mov v3.s[1], w9
+; CHECK-GI-NEXT: mov v4.s[2], v7.s[2]
; CHECK-GI-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-GI-NEXT: mov v2.s[0], v6.s[0]
; CHECK-GI-NEXT: mov v3.s[2], w9
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: neg v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v2.s[1], v6.s[1]
; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v2.s[2], v6.s[2]
; CHECK-GI-NEXT: eor v1.16b, v0.16b, v3.16b
-; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b
-; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b
-; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: and v1.16b, v4.16b, v1.16b
+; CHECK-GI-NEXT: orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = fcmp olt <3 x double> %a, %b
@@ -1000,22 +1011,37 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d,
;
; CHECK-GI-LABEL: v3f32_float:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v4.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v1.s[0]
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: mov v6.s[0], w8
; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
-; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: mov v4.s[0], w8
-; CHECK-GI-NEXT: mov v5.s[0], w9
-; CHECK-GI-NEXT: mov v4.s[1], w8
-; CHECK-GI-NEXT: mov v5.s[1], w9
-; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov v5.s[2], w9
-; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v1.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
-; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v4.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v6.s[1], w8
+; CHECK-GI-NEXT: mov v4.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v0.s[0], w9
+; CHECK-GI-NEXT: mov v6.s[2], w8
+; CHECK-GI-NEXT: fcmgt v1.4s, v5.4s, v4.4s
+; CHECK-GI-NEXT: mov v4.s[0], v2.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT: neg v6.4s, v6.4s
+; CHECK-GI-NEXT: mov v4.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v3.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], w9
+; CHECK-GI-NEXT: sshl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT: mov v4.s[2], v2.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v3.s[2]
+; CHECK-GI-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: and v1.16b, v4.16b, v1.16b
+; CHECK-GI-NEXT: and v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = fcmp olt <3 x float> %a, %b
@@ -1078,22 +1104,37 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i
;
; CHECK-GI-LABEL: v3f32_i32:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v4.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v1.s[0]
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: mov v6.s[0], w8
; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
-; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: mov v4.s[0], w8
-; CHECK-GI-NEXT: mov v5.s[0], w9
-; CHECK-GI-NEXT: mov v4.s[1], w8
-; CHECK-GI-NEXT: mov v5.s[1], w9
-; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov v5.s[2], w9
-; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v1.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
-; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v4.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v6.s[1], w8
+; CHECK-GI-NEXT: mov v4.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v0.s[0], w9
+; CHECK-GI-NEXT: mov v6.s[2], w8
+; CHECK-GI-NEXT: fcmgt v1.4s, v5.4s, v4.4s
+; CHECK-GI-NEXT: mov v4.s[0], v2.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT: neg v6.4s, v6.4s
+; CHECK-GI-NEXT: mov v4.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v3.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], w9
+; CHECK-GI-NEXT: sshl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT: mov v4.s[2], v2.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v3.s[2]
+; CHECK-GI-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: and v1.16b, v4.16b, v1.16b
+; CHECK-GI-NEXT: and v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = fcmp olt <3 x float> %a, %b
@@ -1204,70 +1245,134 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
;
; CHECK-GI-NOFP16-LABEL: v7f16_half:
; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov w8, #15 // =0xf
-; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: fmov s5, w8
+; CHECK-GI-NOFP16-NEXT: fmov s6, w8
+; CHECK-GI-NOFP16-NEXT: mov v17.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: mov w9, #65535 // =0xffff
+; CHECK-GI-NOFP16-NEXT: mov v16.h[0], v1.h[4]
; CHECK-GI-NOFP16-NEXT: fmov s7, w9
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v18.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v19.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v7.h[1], w9
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], w8
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v18.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v19.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], w8
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v7.h[2], w9
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v5.h[3], w8
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v18.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v19.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[3], w8
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v17.4h
; CHECK-GI-NOFP16-NEXT: mov v7.h[3], w9
-; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[4], w8
-; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v6.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: mov v18.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v19.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v16.4h
+; CHECK-GI-NOFP16-NEXT: mov v6.h[4], w8
; CHECK-GI-NOFP16-NEXT: mov v7.h[4], w9
-; CHECK-GI-NOFP16-NEXT: mov v5.h[5], w8
-; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NOFP16-NEXT: mov v18.h[4], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v19.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v4.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v5.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v6.h[5], w8
; CHECK-GI-NOFP16-NEXT: mov v7.h[5], w9
-; CHECK-GI-NOFP16-NEXT: mov v5.h[6], w8
+; CHECK-GI-NOFP16-NEXT: mov v18.h[5], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v19.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NOFP16-NEXT: mov v6.h[6], w8
; CHECK-GI-NOFP16-NEXT: mov v7.h[6], w9
-; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v5.8h
-; CHECK-GI-NOFP16-NEXT: neg v1.8h, v5.8h
+; CHECK-GI-NOFP16-NEXT: mov v18.h[6], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v19.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v6.8h
+; CHECK-GI-NOFP16-NEXT: neg v1.8h, v6.8h
; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v0.8h, v1.8h
; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-GI-NOFP16-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: and v0.16b, v18.16b, v0.16b
+; CHECK-GI-NOFP16-NEXT: and v1.16b, v19.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: v7f16_half:
; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: mov v4.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[0], v1.h[0]
; CHECK-GI-FP16-NEXT: mov w8, #15 // =0xf
+; CHECK-GI-FP16-NEXT: fmov s6, w8
; CHECK-GI-FP16-NEXT: mov w9, #65535 // =0xffff
-; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT: fmov s4, w8
-; CHECK-GI-FP16-NEXT: fmov s5, w9
-; CHECK-GI-FP16-NEXT: mov v4.h[1], w8
-; CHECK-GI-FP16-NEXT: mov v5.h[1], w9
-; CHECK-GI-FP16-NEXT: mov v4.h[2], w8
-; CHECK-GI-FP16-NEXT: mov v5.h[2], w9
-; CHECK-GI-FP16-NEXT: mov v4.h[3], w8
-; CHECK-GI-FP16-NEXT: mov v5.h[3], w9
-; CHECK-GI-FP16-NEXT: mov v4.h[4], w8
-; CHECK-GI-FP16-NEXT: mov v5.h[4], w9
-; CHECK-GI-FP16-NEXT: mov v4.h[5], w8
-; CHECK-GI-FP16-NEXT: mov v5.h[5], w9
-; CHECK-GI-FP16-NEXT: mov v4.h[6], w8
-; CHECK-GI-FP16-NEXT: mov v5.h[6], w9
-; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v4.8h
-; CHECK-GI-FP16-NEXT: neg v1.8h, v4.8h
+; CHECK-GI-FP16-NEXT: mov v16.h[0], v2.h[0]
+; CHECK-GI-FP16-NEXT: fmov s7, w9
+; CHECK-GI-FP16-NEXT: mov v17.h[0], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov v4.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v5.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v6.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v7.h[1], w9
+; CHECK-GI-FP16-NEXT: mov v16.h[1], v2.h[1]
+; CHECK-GI-FP16-NEXT: mov v17.h[1], v3.h[1]
+; CHECK-GI-FP16-NEXT: mov v4.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v5.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v6.h[2], w8
+; CHECK-GI-FP16-NEXT: mov v7.h[2], w9
+; CHECK-GI-FP16-NEXT: mov v16.h[2], v2.h[2]
+; CHECK-GI-FP16-NEXT: mov v17.h[2], v3.h[2]
+; CHECK-GI-FP16-NEXT: mov v4.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v5.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v6.h[3], w8
+; CHECK-GI-FP16-NEXT: mov v7.h[3], w9
+; CHECK-GI-FP16-NEXT: mov v16.h[3], v2.h[3]
+; CHECK-GI-FP16-NEXT: mov v17.h[3], v3.h[3]
+; CHECK-GI-FP16-NEXT: mov v4.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v5.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v6.h[4], w8
+; CHECK-GI-FP16-NEXT: mov v7.h[4], w9
+; CHECK-GI-FP16-NEXT: mov v16.h[4], v2.h[4]
+; CHECK-GI-FP16-NEXT: mov v17.h[4], v3.h[4]
+; CHECK-GI-FP16-NEXT: mov v4.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v5.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v6.h[5], w8
+; CHECK-GI-FP16-NEXT: mov v7.h[5], w9
+; CHECK-GI-FP16-NEXT: mov v16.h[5], v2.h[5]
+; CHECK-GI-FP16-NEXT: mov v17.h[5], v3.h[5]
+; CHECK-GI-FP16-NEXT: mov v4.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v5.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: mov v6.h[6], w8
+; CHECK-GI-FP16-NEXT: mov v7.h[6], w9
+; CHECK-GI-FP16-NEXT: mov v16.h[6], v2.h[6]
+; CHECK-GI-FP16-NEXT: mov v17.h[6], v3.h[6]
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v5.8h, v4.8h
+; CHECK-GI-FP16-NEXT: neg v1.8h, v6.8h
+; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v6.8h
; CHECK-GI-FP16-NEXT: sshl v0.8h, v0.8h, v1.8h
-; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v5.16b
-; CHECK-GI-FP16-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-GI-FP16-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-GI-FP16-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v7.16b
+; CHECK-GI-FP16-NEXT: and v0.16b, v16.16b, v0.16b
+; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b
+; CHECK-GI-FP16-NEXT: orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fcmp olt <7 x half> %a, %b
@@ -1690,61 +1795,69 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
;
; CHECK-GI-NOFP16-LABEL: v7f16_i32:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT: mov v4.s[0], w8
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov w9, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT: mov v5.s[0], w0
-; CHECK-GI-NOFP16-NEXT: mov v6.s[0], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[0], w7
-; CHECK-GI-NOFP16-NEXT: ldr s16, [sp]
-; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #24]
-; CHECK-GI-NOFP16-NEXT: ldr s18, [sp, #32]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w1
+; CHECK-GI-NOFP16-NEXT: mov v6.s[0], w8
+; CHECK-GI-NOFP16-NEXT: mov v16.s[0], w9
+; CHECK-GI-NOFP16-NEXT: ldr s18, [sp]
+; CHECK-GI-NOFP16-NEXT: mov v7.s[0], w0
+; CHECK-GI-NOFP16-NEXT: mov v17.s[0], w7
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v6.s[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v7.s[1], w1
; CHECK-GI-NOFP16-NEXT: mov v17.s[1], v18.s[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.s[1], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v16.s[0]
-; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #8]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w2
-; CHECK-GI-NOFP16-NEXT: mov v6.s[2], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v16.s[0]
-; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #40]
+; CHECK-GI-NOFP16-NEXT: ldr s18, [sp, #32]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v6.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov v16.s[2], w9
+; CHECK-GI-NOFP16-NEXT: mov v7.s[2], w2
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], w4
+; CHECK-GI-NOFP16-NEXT: mov v7.s[3], w3
+; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v5.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: ldr s5, [sp, #24]
+; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #8]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], w5
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v17.s[2], v16.s[0]
-; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w3
+; CHECK-GI-NOFP16-NEXT: mov v5.s[1], v18.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v17.s[2], v4.s[0]
+; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #40]
+; CHECK-GI-NOFP16-NEXT: ushl v0.4s, v0.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT: neg v6.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.s[2], w6
; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.s[0], w4
-; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: neg v4.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5
-; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #16]
-; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6
-; CHECK-GI-NOFP16-NEXT: mov v7.s[3], v4.s[0]
-; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v6.16b
-; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b
-; CHECK-GI-NOFP16-NEXT: and v1.16b, v17.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v5.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NOFP16-NEXT: mov s4, v0.s[3]
-; CHECK-GI-NOFP16-NEXT: fmov w0, s0
-; CHECK-GI-NOFP16-NEXT: mov s5, v1.s[1]
-; CHECK-GI-NOFP16-NEXT: mov s6, v1.s[2]
-; CHECK-GI-NOFP16-NEXT: fmov w4, s1
-; CHECK-GI-NOFP16-NEXT: fmov w1, s2
+; CHECK-GI-NOFP16-NEXT: mov v5.s[2], v4.s[0]
+; CHECK-GI-NOFP16-NEXT: sshl v0.4s, v0.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #16]
+; CHECK-GI-NOFP16-NEXT: mov v17.s[3], v6.s[0]
+; CHECK-GI-NOFP16-NEXT: eor v3.16b, v0.16b, v16.16b
+; CHECK-GI-NOFP16-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NOFP16-NEXT: and v1.16b, v5.16b, v3.16b
+; CHECK-GI-NOFP16-NEXT: bsl v2.16b, v7.16b, v17.16b
+; CHECK-GI-NOFP16-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: mov s1, v2.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s3, v2.s[2]
+; CHECK-GI-NOFP16-NEXT: mov s4, v2.s[3]
+; CHECK-GI-NOFP16-NEXT: fmov w0, s2
+; CHECK-GI-NOFP16-NEXT: mov s5, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s6, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: fmov w4, s0
+; CHECK-GI-NOFP16-NEXT: fmov w1, s1
; CHECK-GI-NOFP16-NEXT: fmov w2, s3
; CHECK-GI-NOFP16-NEXT: fmov w3, s4
; CHECK-GI-NOFP16-NEXT: fmov w5, s5
@@ -1753,37 +1866,51 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
;
; CHECK-GI-FP16-LABEL: v7f16_i32:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
; CHECK-GI-FP16-NEXT: mov w9, #31 // =0x1f
; CHECK-GI-FP16-NEXT: mov v4.s[0], w0
-; CHECK-GI-FP16-NEXT: mov v2.s[0], w9
; CHECK-GI-FP16-NEXT: mov v5.s[0], w7
; CHECK-GI-FP16-NEXT: ldr s6, [sp]
; CHECK-GI-FP16-NEXT: mov v7.s[0], w4
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32]
; CHECK-GI-FP16-NEXT: ldr s17, [sp, #8]
-; CHECK-GI-FP16-NEXT: umov w8, v0.h[4]
-; CHECK-GI-FP16-NEXT: umov w10, v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
; CHECK-GI-FP16-NEXT: mov v4.s[1], w1
-; CHECK-GI-FP16-NEXT: mov v2.s[1], w9
; CHECK-GI-FP16-NEXT: mov v5.s[1], v6.s[0]
; CHECK-GI-FP16-NEXT: ldr s6, [sp, #24]
; CHECK-GI-FP16-NEXT: mov v7.s[1], w5
; CHECK-GI-FP16-NEXT: mov v6.s[1], v16.s[0]
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
-; CHECK-GI-FP16-NEXT: umov w8, v0.h[6]
-; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-GI-FP16-NEXT: mov v2.s[2], w9
+; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
; CHECK-GI-FP16-NEXT: mov v4.s[2], w2
; CHECK-GI-FP16-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-FP16-NEXT: mov v7.s[2], w6
-; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
; CHECK-GI-FP16-NEXT: mov v6.s[2], v16.s[0]
+; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v4.s[3], w3
+; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v3.8h, v2.8h
+; CHECK-GI-FP16-NEXT: mov v2.s[0], w9
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT: umov w10, v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v2.s[2], w9
+; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-FP16-NEXT: mov v1.s[1], w10
; CHECK-GI-FP16-NEXT: mov w10, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
; CHECK-GI-FP16-NEXT: mov v3.s[0], w10
-; CHECK-GI-FP16-NEXT: mov v4.s[3], w3
; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31
; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
; CHECK-GI-FP16-NEXT: mov v3.s[1], w10
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index a42ec8e253be29..7f07b088182cae 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -111,7 +111,8 @@ define <3 x double> @copysign_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: and x9, x9, #0x8000000000000000
; CHECK-GI-NEXT: fneg v1.2d, v6.2d
; CHECK-GI-NEXT: orr x8, x8, x9
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: bif v0.16b, v3.16b, v1.16b
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -156,15 +157,24 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000
; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff
-; CHECK-GI-NEXT: mov v2.s[0], w9
-; CHECK-GI-NEXT: mov v3.s[0], w8
-; CHECK-GI-NEXT: mov v2.s[1], w9
-; CHECK-GI-NEXT: mov v3.s[1], w8
-; CHECK-GI-NEXT: mov v2.s[2], w9
-; CHECK-GI-NEXT: mov v3.s[2], w8
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], w9
+; CHECK-GI-NEXT: mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], w8
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], w9
+; CHECK-GI-NEXT: mov v4.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], w8
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v4.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], w9
+; CHECK-GI-NEXT: mov v5.s[2], w8
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v4.16b, v5.16b
+; CHECK-GI-NEXT: orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.copysign.v3f32(<3 x float> %a, <3 x float> %b)
@@ -203,25 +213,46 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-LABEL: copysign_v7f16:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
; CHECK-GI-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NEXT: mov w9, #32767 // =0x7fff
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: fmov s3, w8
-; CHECK-GI-NEXT: mov v2.h[1], w9
-; CHECK-GI-NEXT: mov v3.h[1], w8
-; CHECK-GI-NEXT: mov v2.h[2], w9
-; CHECK-GI-NEXT: mov v3.h[2], w8
-; CHECK-GI-NEXT: mov v2.h[3], w9
-; CHECK-GI-NEXT: mov v3.h[3], w8
-; CHECK-GI-NEXT: mov v2.h[4], w9
-; CHECK-GI-NEXT: mov v3.h[4], w8
-; CHECK-GI-NEXT: mov v2.h[5], w9
-; CHECK-GI-NEXT: mov v3.h[5], w8
-; CHECK-GI-NEXT: mov v2.h[6], w9
-; CHECK-GI-NEXT: mov v3.h[6], w8
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v5.h[1], w8
+; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v4.h[1], w9
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v5.h[2], w8
+; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v4.h[2], w9
+; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v5.h[3], w8
+; CHECK-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v4.h[3], w9
+; CHECK-GI-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-NEXT: mov v5.h[4], w8
+; CHECK-GI-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v4.h[4], w9
+; CHECK-GI-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-NEXT: mov v5.h[5], w8
+; CHECK-GI-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v4.h[5], w9
+; CHECK-GI-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v5.h[6], w8
+; CHECK-GI-NEXT: mov v4.h[6], w9
+; CHECK-GI-NEXT: and v1.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: orr v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.copysign.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index b408e9c1bd4e60..55d9984c6392f5 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -84,6 +84,7 @@ define <3 x double> @ceil_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: frintp d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: frintp v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -115,10 +116,21 @@ entry:
}
define <3 x float> @ceil_v3f32(<3 x float> %a) {
-; CHECK-LABEL: ceil_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: frintp v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ceil_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: frintp v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ceil_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: frintp v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.ceil.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -163,27 +175,52 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: ceil_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: frintp v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: frintp v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: ceil_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: frintp v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: frintp v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.ceil.v7f16(<7 x half> %a)
@@ -383,6 +420,7 @@ define <3 x double> @floor_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: frintm d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: frintm v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -414,10 +452,21 @@ entry:
}
define <3 x float> @floor_v3f32(<3 x float> %a) {
-; CHECK-LABEL: floor_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: frintm v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: floor_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: frintm v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: floor_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: frintm v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.floor.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -462,27 +511,52 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: floor_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: frintm v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: frintm v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: floor_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: frintm v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: frintm v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.floor.v7f16(<7 x half> %a)
@@ -682,6 +756,7 @@ define <3 x double> @nearbyint_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: frinti d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: frinti v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -713,10 +788,21 @@ entry:
}
define <3 x float> @nearbyint_v3f32(<3 x float> %a) {
-; CHECK-LABEL: nearbyint_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: frinti v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: nearbyint_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: frinti v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: nearbyint_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: frinti v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.nearbyint.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -761,27 +847,52 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: frinti v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: frinti v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: nearbyint_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: frinti v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: frinti v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.nearbyint.v7f16(<7 x half> %a)
@@ -981,6 +1092,7 @@ define <3 x double> @roundeven_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: frintn d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: frintn v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -1012,10 +1124,21 @@ entry:
}
define <3 x float> @roundeven_v3f32(<3 x float> %a) {
-; CHECK-LABEL: roundeven_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: frintn v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: roundeven_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: frintn v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: roundeven_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: frintn v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -1060,27 +1183,52 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: roundeven_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: frintn v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: frintn v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: roundeven_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: frintn v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: frintn v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.roundeven.v7f16(<7 x half> %a)
@@ -1280,6 +1428,7 @@ define <3 x double> @rint_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: frintx d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: frintx v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -1311,10 +1460,21 @@ entry:
}
define <3 x float> @rint_v3f32(<3 x float> %a) {
-; CHECK-LABEL: rint_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: frintx v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: rint_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: frintx v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rint_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: frintx v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.rint.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -1359,27 +1519,52 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: rint_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: frintx v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: frintx v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: rint_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: frintx v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: frintx v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.rint.v7f16(<7 x half> %a)
@@ -1579,6 +1764,7 @@ define <3 x double> @round_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: frinta d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: frinta v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -1610,10 +1796,21 @@ entry:
}
define <3 x float> @round_v3f32(<3 x float> %a) {
-; CHECK-LABEL: round_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: frinta v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: round_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: frinta v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: round_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: frinta v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.round.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -1658,27 +1855,52 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: round_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: frinta v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: frinta v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: round_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: frinta v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: frinta v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.round.v7f16(<7 x half> %a)
@@ -1878,6 +2100,7 @@ define <3 x double> @trunc_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: frintz d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: frintz v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -1909,10 +2132,21 @@ entry:
}
define <3 x float> @trunc_v3f32(<3 x float> %a) {
-; CHECK-LABEL: trunc_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: frintz v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: trunc_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: frintz v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: trunc_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: frintz v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.trunc.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -1957,27 +2191,52 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: trunc_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: frintz v2.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: frintz v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: trunc_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: frintz v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: frintz v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.trunc.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index 5bdccccc62b99c..9acd0166fcaa85 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -93,6 +93,7 @@ define <3 x double> @fdiv_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: fdiv d2, d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fdiv v0.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -130,10 +131,24 @@ entry:
}
define <3 x float> @fdiv_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: fdiv_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fdiv_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fdiv_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fdiv <3 x float> %a, %b
ret <3 x float> %c
@@ -186,32 +201,68 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fdiv_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fdiv v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fdiv_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fdiv v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fdiv v1.8h, v2.8h, v3.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fdiv <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index f13e2fcd1c4483..6072a2c56a06d1 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -139,29 +139,33 @@ define <3 x double> @exp_v3f64(<3 x double> %a) {
;
; CHECK-GI-LABEL: exp_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: bl exp
-; CHECK-GI-NEXT: fmov d10, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl exp
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl exp
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: fmov d0, d10
-; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.exp.v3f64(<3 x double> %a)
@@ -355,7 +359,9 @@ define <3 x float> @exp_v3f32(<3 x float> %a) {
; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -726,7 +732,13 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -1442,29 +1454,33 @@ define <3 x double> @exp2_v3f64(<3 x double> %a) {
;
; CHECK-GI-LABEL: exp2_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: bl exp2
-; CHECK-GI-NEXT: fmov d10, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl exp2
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl exp2
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: fmov d0, d10
-; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.exp2.v3f64(<3 x double> %a)
@@ -1658,7 +1674,9 @@ define <3 x float> @exp2_v3f32(<3 x float> %a) {
; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -2029,7 +2047,13 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -2745,29 +2769,33 @@ define <3 x double> @log_v3f64(<3 x double> %a) {
;
; CHECK-GI-LABEL: log_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: bl log
-; CHECK-GI-NEXT: fmov d10, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl log
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl log
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: fmov d0, d10
-; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.log.v3f64(<3 x double> %a)
@@ -2961,7 +2989,9 @@ define <3 x float> @log_v3f32(<3 x float> %a) {
; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -3332,7 +3362,13 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -4048,29 +4084,33 @@ define <3 x double> @log2_v3f64(<3 x double> %a) {
;
; CHECK-GI-LABEL: log2_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: bl log2
-; CHECK-GI-NEXT: fmov d10, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl log2
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl log2
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: fmov d0, d10
-; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.log2.v3f64(<3 x double> %a)
@@ -4264,7 +4304,9 @@ define <3 x float> @log2_v3f32(<3 x float> %a) {
; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -4635,7 +4677,13 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -5351,29 +5399,33 @@ define <3 x double> @log10_v3f64(<3 x double> %a) {
;
; CHECK-GI-LABEL: log10_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: bl log10
-; CHECK-GI-NEXT: fmov d10, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl log10
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl log10
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: fmov d0, d10
-; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.log10.v3f64(<3 x double> %a)
@@ -5567,7 +5619,9 @@ define <3 x float> @log10_v3f32(<3 x float> %a) {
; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -5938,7 +5992,13 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index bbfec8c7c33617..83b6f3c26f34c6 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -16,8 +16,12 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
;
; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: uzp1 v2.4h, v0.4h, v0.4h
-; CHECK-GI-NEXT: uzp2 v1.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[3]
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-GI-NEXT: fmov d0, d2
; CHECK-GI-NEXT: ret
%retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index fb12f8acf17453..c2e91a9956af91 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -154,6 +154,7 @@ define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: fmin d2, d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fmin v0.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -191,6 +192,7 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: fmax d2, d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fmax v0.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -255,20 +257,48 @@ entry:
}
define <3 x float> @min_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: min_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: min_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmin v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: min_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: fmin v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %c
}
define <3 x float> @max_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: max_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: max_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmax v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: max_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: fmax v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %c
@@ -662,32 +692,68 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-NOFP16-GI-LABEL: min_v7f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[0]
; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: fmin v2.4s, v2.4s, v3.4s
; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: mov v1.h[0], v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT: fmin v1.4s, v1.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmin v0.4s, v0.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v1.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT: mov v1.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: min_v7f16:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: fmin v0.8h, v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-FP16-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-FP16-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-FP16-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-FP16-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-FP16-GI-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-FP16-GI-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-FP16-GI-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT: fmin v1.8h, v2.8h, v3.8h
+; CHECK-FP16-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-FP16-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.minimum.v7f16(<7 x half> %a, <7 x half> %b)
@@ -760,32 +826,68 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-NOFP16-GI-LABEL: max_v7f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[0]
; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: fmax v2.4s, v2.4s, v3.4s
; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: mov v1.h[0], v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT: fmax v1.4s, v1.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmax v0.4s, v0.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v1.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT: mov v1.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: max_v7f16:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: fmax v0.8h, v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-FP16-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-FP16-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-FP16-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-FP16-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-FP16-GI-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-FP16-GI-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-FP16-GI-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT: fmax v1.8h, v2.8h, v3.8h
+; CHECK-FP16-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-FP16-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.maximum.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index 64f0da8b4cd0f9..b7af6be8721d68 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -154,6 +154,7 @@ define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: fminnm d2, d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fminnm v0.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -191,6 +192,7 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: fmaxnm d2, d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fmaxnm v0.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -255,20 +257,48 @@ entry:
}
define <3 x float> @min_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: min_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: min_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: min_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: fminnm v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %c
}
define <3 x float> @max_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: max_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: max_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: max_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: fmaxnm v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %c
@@ -662,32 +692,68 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-NOFP16-GI-LABEL: min_v7f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[0]
; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: fminnm v2.4s, v2.4s, v3.4s
; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: mov v1.h[0], v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT: fminnm v1.4s, v1.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fminnm v0.4s, v0.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v1.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT: mov v1.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: min_v7f16:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: fminnm v0.8h, v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-FP16-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-FP16-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-FP16-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-FP16-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-FP16-GI-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-FP16-GI-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-FP16-GI-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT: fminnm v1.8h, v2.8h, v3.8h
+; CHECK-FP16-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-FP16-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.minnum.v7f16(<7 x half> %a, <7 x half> %b)
@@ -760,32 +826,68 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-NOFP16-GI-LABEL: max_v7f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[0]
; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: fmaxnm v2.4s, v2.4s, v3.4s
; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: mov v1.h[0], v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT: fmaxnm v1.4s, v1.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmaxnm v0.4s, v0.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v1.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT: mov v1.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: max_v7f16:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: fmaxnm v0.8h, v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-FP16-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-FP16-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-FP16-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-FP16-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-FP16-GI-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-FP16-GI-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-FP16-GI-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT: fmaxnm v1.8h, v2.8h, v3.8h
+; CHECK-FP16-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-FP16-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-FP16-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-FP16-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-FP16-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.maxnum.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 7bcaae5a77eac5..0a9d4c7b657e06 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -105,6 +105,7 @@ define <3 x double> @fma_v3f64(<3 x double> %a, <3 x double> %b, <3 x double> %c
; CHECK-GI-NEXT: fmla v6.2d, v3.2d, v0.2d
; CHECK-GI-NEXT: ldr d0, [sp]
; CHECK-GI-NEXT: fmadd d2, d2, d5, d0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v6.d[1]
; CHECK-GI-NEXT: fmov d0, d6
; CHECK-GI-NEXT: ret
@@ -138,11 +139,28 @@ entry:
}
define <3 x float> @fma_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
-; CHECK-LABEL: fma_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmla v2.4s, v1.4s, v0.4s
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fma_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmla v2.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fma_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v2.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v4.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v4.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v2.s[2]
+; CHECK-GI-NEXT: fmla v5.4s, v4.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v5.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v5.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v5.s[2]
+; CHECK-GI-NEXT: ret
entry:
%d = call <3 x float> @llvm.fma.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c)
ret <3 x float> %d
@@ -254,38 +272,84 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
;
; CHECK-GI-NOFP16-LABEL: fma_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[0]
; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fmla v5.4s, v4.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v2.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v2.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v5.4h, v5.4s
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v5.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v5.h[1]
-; CHECK-GI-NOFP16-NEXT: fmla v3.4s, v2.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v5.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v5.h[1]
+; CHECK-GI-NOFP16-NEXT: fmla v3.4s, v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v5.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v5.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fma_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fmla v2.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-FP16-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v4.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[0], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v4.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v5.h[1], v2.h[1]
+; CHECK-GI-FP16-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v4.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v5.h[2], v2.h[2]
+; CHECK-GI-FP16-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v4.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v5.h[3], v2.h[3]
+; CHECK-GI-FP16-NEXT: mov v3.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v4.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v5.h[4], v2.h[4]
+; CHECK-GI-FP16-NEXT: mov v3.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v4.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v5.h[5], v2.h[5]
+; CHECK-GI-FP16-NEXT: mov v3.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v4.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: mov v5.h[6], v2.h[6]
+; CHECK-GI-FP16-NEXT: fmla v5.8h, v4.8h, v3.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v5.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v5.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v5.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v5.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v5.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v5.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v5.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%d = call <7 x half> @llvm.fma.v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c)
@@ -756,6 +820,7 @@ define <3 x double> @fmuladd_v3f64(<3 x double> %a, <3 x double> %b, <3 x double
; CHECK-GI-NEXT: fmla v6.2d, v3.2d, v0.2d
; CHECK-GI-NEXT: ldr d0, [sp]
; CHECK-GI-NEXT: fmadd d2, d2, d5, d0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v6.d[1]
; CHECK-GI-NEXT: fmov d0, d6
; CHECK-GI-NEXT: ret
@@ -789,11 +854,28 @@ entry:
}
define <3 x float> @fmuladd_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
-; CHECK-LABEL: fmuladd_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmla v2.4s, v1.4s, v0.4s
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fmuladd_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmla v2.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fmuladd_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v2.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v4.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v4.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v2.s[2]
+; CHECK-GI-NEXT: fmla v5.4s, v4.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v5.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v5.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v5.s[2]
+; CHECK-GI-NEXT: ret
entry:
%d = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c)
ret <3 x float> %d
@@ -852,44 +934,90 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
;
; CHECK-GI-NOFP16-LABEL: fmuladd_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4]
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5]
; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmuladd_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fmla v2.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-FP16-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v4.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[0], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v4.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v5.h[1], v2.h[1]
+; CHECK-GI-FP16-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v4.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v5.h[2], v2.h[2]
+; CHECK-GI-FP16-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v4.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v5.h[3], v2.h[3]
+; CHECK-GI-FP16-NEXT: mov v3.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v4.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v5.h[4], v2.h[4]
+; CHECK-GI-FP16-NEXT: mov v3.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v4.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v5.h[5], v2.h[5]
+; CHECK-GI-FP16-NEXT: mov v3.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v4.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: mov v5.h[6], v2.h[6]
+; CHECK-GI-FP16-NEXT: fmla v5.8h, v4.8h, v3.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v5.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v5.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v5.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v5.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v5.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v5.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v5.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%d = call <7 x half> @llvm.fmuladd.v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c)
@@ -1204,6 +1332,7 @@ define <3 x double> @fmul_v3f64(<3 x double> %a, <3 x double> %b, <3 x double> %
; CHECK-GI-NEXT: fmla v6.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: ldr d0, [sp]
; CHECK-GI-NEXT: fmadd d2, d2, d5, d0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v6.d[1]
; CHECK-GI-NEXT: fmov d0, d6
; CHECK-GI-NEXT: ret
@@ -1262,8 +1391,19 @@ define <3 x float> @fmul_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
;
; CHECK-GI-LABEL: fmul_v3f32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmla v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v2.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v4.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v4.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v2.s[2]
+; CHECK-GI-NEXT: fmla v5.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: mov v0.s[0], v5.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v5.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v5.s[2]
; CHECK-GI-NEXT: ret
entry:
%d = fmul fast <3 x float> %a, %b
@@ -1340,44 +1480,90 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
;
; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4]
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5]
; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmul_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fmla v2.8h, v0.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-FP16-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v4.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[0], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v4.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v5.h[1], v2.h[1]
+; CHECK-GI-FP16-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v4.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v5.h[2], v2.h[2]
+; CHECK-GI-FP16-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v4.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v5.h[3], v2.h[3]
+; CHECK-GI-FP16-NEXT: mov v3.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v4.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v5.h[4], v2.h[4]
+; CHECK-GI-FP16-NEXT: mov v3.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v4.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v5.h[5], v2.h[5]
+; CHECK-GI-FP16-NEXT: mov v3.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v4.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: mov v5.h[6], v2.h[6]
+; CHECK-GI-FP16-NEXT: fmla v5.8h, v3.8h, v4.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v5.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v5.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v5.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v5.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v5.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v5.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v5.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%d = fmul fast <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index bd3d1353e643e5..de6618ac18f157 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -93,6 +93,7 @@ define <3 x double> @fmul_v3f64(<3 x double> %a, <3 x double> %b) {
; CHECK-GI-NEXT: fmul d2, d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fmul v0.2d, v0.2d, v3.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -130,10 +131,24 @@ entry:
}
define <3 x float> @fmul_v3f32(<3 x float> %a, <3 x float> %b) {
-; CHECK-LABEL: fmul_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fmul_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fmul_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: fmul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fmul <3 x float> %a, %b
ret <3 x float> %c
@@ -186,32 +201,68 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v0.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmul_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fmul v1.8h, v2.8h, v3.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fmul <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index de2671afe60ab7..dd6266e8b3b1f4 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -88,6 +88,7 @@ define <3 x double> @fabs_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fneg d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fneg v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -119,10 +120,21 @@ entry:
}
define <3 x float> @fabs_v3f32(<3 x float> %a) {
-; CHECK-LABEL: fabs_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fneg v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fabs_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fneg v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fabs_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fneg v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fneg <3 x float> %a
ret <3 x float> %c
@@ -163,13 +175,41 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: movi v1.8h, #128, lsl #8
-; CHECK-GI-NOFP16-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fabs_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fneg v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: fneg v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fneg <7 x half> %a
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index df90f9d5f09109..7a30b68be6eae2 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -82,9 +82,12 @@ define <3 x double> @fpext_v3f32_v3f64(<3 x float> %a) {
;
; CHECK-GI-LABEL: fpext_v3f32_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[2]
-; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-GI-NEXT: fcvt d2, s1
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: fcvt d2, s2
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: fcvtl v0.2d, v1.2s
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
@@ -355,10 +358,14 @@ define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fcvt d0, h0
+; CHECK-GI-NEXT: fcvt d3, h0
+; CHECK-GI-NEXT: mov h0, v0.h[2]
; CHECK-GI-NEXT: fcvt d1, h1
-; CHECK-GI-NEXT: fcvt d2, h2
+; CHECK-GI-NEXT: fcvt d2, h0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
; CHECK-GI-NEXT: ret
entry:
%c = fpext <3 x half> %a to <3 x double>
@@ -403,10 +410,22 @@ entry:
}
define <3 x float> @fpext_v3f16_v3f32(<3 x half> %a) {
-; CHECK-LABEL: fpext_v3f16_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fpext_v3f16_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fpext_v3f16_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fpext <3 x half> %a to <3 x float>
ret <3 x float> %c
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index dc93d5be9b3f38..fb7efe82582322 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -156,38 +156,42 @@ define <3 x double> @pow_v3f64(<3 x double> %a, <3 x double> %b) {
;
; CHECK-GI-LABEL: pow_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d12, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d11, d10, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #24] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #40] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 48
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
-; CHECK-GI-NEXT: .cfi_offset b11, -40
-; CHECK-GI-NEXT: .cfi_offset b12, -48
+; CHECK-GI-NEXT: sub sp, sp, #80
+; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
+; CHECK-GI-NEXT: .cfi_offset b10, -40
+; CHECK-GI-NEXT: .cfi_offset b11, -48
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d1, d3
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: fmov d10, d4
; CHECK-GI-NEXT: fmov d11, d5
; CHECK-GI-NEXT: bl pow
-; CHECK-GI-NEXT: fmov d12, d0
-; CHECK-GI-NEXT: fmov d0, d8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d1, d10
+; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl pow
-; CHECK-GI-NEXT: fmov d8, d0
-; CHECK-GI-NEXT: fmov d0, d9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d1, d11
+; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl pow
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldp d11, d10, [sp, #8] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fmov d0, d12
-; CHECK-GI-NEXT: ldr d12, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #80
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.pow.v3f64(<3 x double> %a, <3 x double> %b)
@@ -419,7 +423,9 @@ define <3 x float> @pow_v3f32(<3 x float> %a, <3 x float> %b) {
; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #80
; CHECK-GI-NEXT: ret
entry:
@@ -879,7 +885,13 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #176
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index 8948556d1b380a..3f122ee06d99a9 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -149,33 +149,37 @@ define <3 x double> @powi_v3f64(<3 x double> %a, i32 %b) {
;
; CHECK-GI-LABEL: powi_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d10, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
; CHECK-GI-NEXT: .cfi_offset w19, -8
; CHECK-GI-NEXT: .cfi_offset w30, -16
; CHECK-GI-NEXT: .cfi_offset b8, -24
; CHECK-GI-NEXT: .cfi_offset b9, -32
-; CHECK-GI-NEXT: .cfi_offset b10, -48
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __powidf2
-; CHECK-GI-NEXT: fmov d10, d0
-; CHECK-GI-NEXT: fmov d0, d8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: mov w0, w19
+; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl __powidf2
-; CHECK-GI-NEXT: fmov d8, d0
-; CHECK-GI-NEXT: fmov d0, d9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: mov w0, w19
+; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl __powidf2
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: fmov d0, d10
-; CHECK-GI-NEXT: ldr d10, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.powi.v3f64.i32(<3 x double> %a, i32 %b)
@@ -393,7 +397,9 @@ define <3 x float> @powi_v3f32(<3 x float> %a, i32 %b) {
; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -809,7 +815,13 @@ define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) {
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 9c4f0207b84ce8..1ab72b7dc0056f 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1015,32 +1015,60 @@ entry:
}
define <3 x i32> @fptos_v3f64_v3i32(<3 x double> %a) {
-; CHECK-LABEL: fptos_v3f64_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptos_v3f64_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptos_v3f64_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fptosi <3 x double> %a to <3 x i32>
ret <3 x i32> %c
}
define <3 x i32> @fptou_v3f64_v3i32(<3 x double> %a) {
-; CHECK-LABEL: fptou_v3f64_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: fcvtzu v1.2d, v2.2d
-; CHECK-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptou_v3f64_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: fcvtzu v1.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptou_v3f64_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d
+; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fptoui <3 x double> %a to <3 x i32>
ret <3 x i32> %c
@@ -1375,17 +1403,33 @@ entry:
}
define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) {
-; CHECK-LABEL: fptos_v3f64_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptos_v3f64_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptos_v3f64_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%c = fptosi <3 x double> %a to <3 x i16>
ret <3 x i16> %c
@@ -1413,7 +1457,11 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) {
; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d
; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = fptoui <3 x double> %a to <3 x i16>
@@ -1876,15 +1924,18 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT: fmov x2, d1
-; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT: mov d2, v0.d[1]
-; CHECK-GI-NEXT: fmov x0, d0
-; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT: fmov x1, d2
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: mov v0.s[0], v0.s[0]
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s1
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
entry:
%c = fptosi <3 x double> %a to <3 x i8>
@@ -1913,15 +1964,18 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d
; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-GI-NEXT: fmov x2, d1
-; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT: mov d2, v0.d[1]
-; CHECK-GI-NEXT: fmov x0, d0
-; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT: fmov x1, d2
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: mov v0.s[0], v0.s[0]
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s1
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
entry:
%c = fptoui <3 x double> %a to <3 x i8>
@@ -2585,14 +2639,16 @@ define <3 x i64> @fptos_v3f32_v3i64(<3 x float> %a) {
;
; CHECK-GI-LABEL: fptos_v3f32_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.s[0], v0.s[2]
-; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v0.s[0], v0.s[2]
; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s
-; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT: fcvtzs v2.2d, v1.2d
+; CHECK-GI-NEXT: fcvtl v2.2d, v0.2s
+; CHECK-GI-NEXT: fcvtzs v0.2d, v1.2d
+; CHECK-GI-NEXT: fcvtzs v2.2d, v2.2d
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
entry:
%c = fptosi <3 x float> %a to <3 x i64>
@@ -2614,14 +2670,16 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) {
;
; CHECK-GI-LABEL: fptou_v3f32_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.s[0], v0.s[2]
-; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v0.s[0], v0.s[2]
; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s
-; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-GI-NEXT: fcvtzu v2.2d, v1.2d
+; CHECK-GI-NEXT: fcvtl v2.2d, v0.2s
+; CHECK-GI-NEXT: fcvtzu v0.2d, v1.2d
+; CHECK-GI-NEXT: fcvtzu v2.2d, v2.2d
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
entry:
%c = fptoui <3 x float> %a to <3 x i64>
@@ -3025,20 +3083,42 @@ entry:
}
define <3 x i32> @fptos_v3f32_v3i32(<3 x float> %a) {
-; CHECK-LABEL: fptos_v3f32_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptos_v3f32_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptos_v3f32_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fptosi <3 x float> %a to <3 x i32>
ret <3 x i32> %c
}
define <3 x i32> @fptou_v3f32_v3i32(<3 x float> %a) {
-; CHECK-LABEL: fptou_v3f32_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptou_v3f32_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptou_v3f32_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fptoui <3 x float> %a to <3 x i32>
ret <3 x i32> %c
@@ -3172,22 +3252,48 @@ entry:
}
define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) {
-; CHECK-LABEL: fptos_v3f32_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptos_v3f32_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptos_v3f32_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtzs v0.4s, v1.4s
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%c = fptosi <3 x float> %a to <3 x i16>
ret <3 x i16> %c
}
define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) {
-; CHECK-LABEL: fptou_v3f32_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptou_v3f32_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptou_v3f32_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtzu v0.4s, v1.4s
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%c = fptoui <3 x float> %a to <3 x i16>
ret <3 x i16> %c
@@ -3414,7 +3520,10 @@ define <3 x i8> @fptos_v3f32_v3i8(<3 x float> %a) {
;
; CHECK-GI-LABEL: fptos_v3f32_v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtzs v0.4s, v1.4s
; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: mov s2, v0.s[2]
; CHECK-GI-NEXT: fmov w0, s0
@@ -3438,7 +3547,10 @@ define <3 x i8> @fptou_v3f32_v3i8(<3 x float> %a) {
;
; CHECK-GI-LABEL: fptou_v3f32_v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtzu v0.4s, v1.4s
; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: mov s2, v0.s[2]
; CHECK-GI-NEXT: fmov w0, s0
@@ -4056,7 +4168,11 @@ define <3 x i64> @fptos_v3f16_v3i64(<3 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i64:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.2d, v0.2s
; CHECK-GI-NOFP16-NEXT: fcvtl2 v2.2d, v0.4s
; CHECK-GI-NOFP16-NEXT: fcvtzs v0.2d, v1.2d
@@ -4120,7 +4236,11 @@ define <3 x i64> @fptou_v3f16_v3i64(<3 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i64:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.2d, v0.2s
; CHECK-GI-NOFP16-NEXT: fcvtl2 v2.2d, v0.4s
; CHECK-GI-NOFP16-NEXT: fcvtzu v0.2d, v1.2d
@@ -5729,22 +5849,48 @@ entry:
}
define <3 x i32> @fptos_v3f16_v3i32(<3 x half> %a) {
-; CHECK-LABEL: fptos_v3f16_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptos_v3f16_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptos_v3f16_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT: fcvtzs v1.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fptosi <3 x half> %a to <3 x i32>
ret <3 x i32> %c
}
define <3 x i32> @fptou_v3f16_v3i32(<3 x half> %a) {
-; CHECK-LABEL: fptou_v3f16_v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptou_v3f16_v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptou_v3f16_v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT: fcvtzu v1.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = fptoui <3 x half> %a to <3 x i32>
ret <3 x i32> %c
@@ -6027,14 +6173,37 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fcvtzs v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: fcvtzs v1.4h, v1.4h
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fptosi <3 x half> %a to <3 x i16>
@@ -6056,14 +6225,37 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fcvtzu v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: fcvtzu v1.4h, v1.4h
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fptoui <3 x half> %a to <3 x i16>
@@ -6493,7 +6685,11 @@ define <3 x i8> @fptos_v3f16_v3i8(<3 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i8:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2]
@@ -6504,10 +6700,22 @@ define <3 x i8> @fptos_v3f16_v3i8(<3 x half> %a) {
;
; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i8:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fcvtzs v0.4h, v0.4h
-; CHECK-GI-FP16-NEXT: umov w0, v0.h[0]
-; CHECK-GI-FP16-NEXT: umov w1, v0.h[1]
-; CHECK-GI-FP16-NEXT: umov w2, v0.h[2]
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: fcvtzs v0.4h, v1.4h
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[0]
+; CHECK-GI-FP16-NEXT: umov w9, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT: mov s0, v1.s[1]
+; CHECK-GI-FP16-NEXT: mov s2, v1.s[2]
+; CHECK-GI-FP16-NEXT: fmov w0, s1
+; CHECK-GI-FP16-NEXT: fmov w1, s0
+; CHECK-GI-FP16-NEXT: fmov w2, s2
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fptosi <3 x half> %a to <3 x i8>
@@ -6535,7 +6743,11 @@ define <3 x i8> @fptou_v3f16_v3i8(<3 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i8:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2]
@@ -6546,10 +6758,22 @@ define <3 x i8> @fptou_v3f16_v3i8(<3 x half> %a) {
;
; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i8:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fcvtzu v0.4h, v0.4h
-; CHECK-GI-FP16-NEXT: umov w0, v0.h[0]
-; CHECK-GI-FP16-NEXT: umov w1, v0.h[1]
-; CHECK-GI-FP16-NEXT: umov w2, v0.h[2]
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: fcvtzu v0.4h, v1.4h
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[0]
+; CHECK-GI-FP16-NEXT: umov w9, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT: mov s0, v1.s[1]
+; CHECK-GI-FP16-NEXT: mov s2, v1.s[2]
+; CHECK-GI-FP16-NEXT: fmov w0, s1
+; CHECK-GI-FP16-NEXT: fmov w1, s0
+; CHECK-GI-FP16-NEXT: fmov w2, s2
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fptoui <3 x half> %a to <3 x i8>
@@ -7323,11 +7547,14 @@ define <3 x i64> @fptos_v3f128_v3i64(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov x20, x0
; CHECK-GI-NEXT: bl __fixtfdi
-; CHECK-GI-NEXT: fmov d0, x19
-; CHECK-GI-NEXT: fmov d1, x20
+; CHECK-GI-NEXT: mov v0.d[0], x19
+; CHECK-GI-NEXT: mov v2.d[0], x0
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov v0.d[1], x20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: fmov d2, x0
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -7380,11 +7607,14 @@ define <3 x i64> @fptou_v3f128_v3i64(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov x20, x0
; CHECK-GI-NEXT: bl __fixunstfdi
-; CHECK-GI-NEXT: fmov d0, x19
-; CHECK-GI-NEXT: fmov d1, x20
+; CHECK-GI-NEXT: mov v0.d[0], x19
+; CHECK-GI-NEXT: mov v2.d[0], x0
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov v0.d[1], x20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: fmov d2, x0
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -7519,11 +7749,14 @@ define <3 x i32> @fptos_v3f128_v3i32(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: mov v1.s[0], w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.s[1], w20
+; CHECK-GI-NEXT: mov v1.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.s[2], w0
+; CHECK-GI-NEXT: mov v1.s[2], w0
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -7572,11 +7805,14 @@ define <3 x i32> @fptou_v3f128_v3i32(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: mov v1.s[0], w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.s[1], w20
+; CHECK-GI-NEXT: mov v1.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.s[2], w0
+; CHECK-GI-NEXT: mov v1.s[2], w0
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -7714,11 +7950,15 @@ define <3 x i16> @fptos_v3f128_v3i16(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: fmov s0, w19
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], w20
+; CHECK-GI-NEXT: mov v0.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[2], w0
+; CHECK-GI-NEXT: mov v0.s[2], w0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
@@ -7771,11 +8011,15 @@ define <3 x i16> @fptou_v3f128_v3i16(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: fmov s0, w19
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], w20
+; CHECK-GI-NEXT: mov v0.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[2], w0
+; CHECK-GI-NEXT: mov v0.s[2], w0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
@@ -7917,11 +8161,16 @@ define <3 x i8> @fptos_v3f128_v3i8(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: mov w2, w0
-; CHECK-GI-NEXT: mov w0, w19
-; CHECK-GI-NEXT: mov w1, w20
-; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.s[1], w20
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.s[2], w0
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s1
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -7976,11 +8225,16 @@ define <3 x i8> @fptou_v3f128_v3i8(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: mov w2, w0
-; CHECK-GI-NEXT: mov w0, w19
-; CHECK-GI-NEXT: mov w1, w20
-; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.s[1], w20
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.s[2], w0
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s1
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 9ef6d61c350ecf..a7c51ea2b9ace1 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -48,10 +48,21 @@ define <2 x i32> @test_signed_v2f32_v2i32(<2 x float> %f) {
}
define <3 x i32> @test_signed_v3f32_v3i32(<3 x float> %f) {
-; CHECK-LABEL: test_signed_v3f32_v3i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_signed_v3f32_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_signed_v3f32_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
%x = call <3 x i32> @llvm.fptosi.sat.v3f32.v3i32(<3 x float> %f)
ret <3 x i32> %x
}
@@ -320,7 +331,10 @@ define <3 x i32> @test_signed_v3f64_v3i32(<3 x double> %f) {
; CHECK-GI-NEXT: cmgt v3.2d, v0.2d, v2.2d
; CHECK-GI-NEXT: bif v1.16b, v2.16b, v4.16b
; CHECK-GI-NEXT: bif v0.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
%x = call <3 x i32> @llvm.fptosi.sat.v3f64.v3i32(<3 x double> %f)
ret <3 x i32> %x
@@ -383,36 +397,35 @@ define <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) {
; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT: fcvtzs v3.2d, v4.2d
+; CHECK-GI-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
; CHECK-GI-NEXT: adrp x8, .LCPI12_0
-; CHECK-GI-NEXT: cmgt v4.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT: cmgt v3.2d, v2.2d, v0.2d
; CHECK-GI-NEXT: cmgt v5.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT: bif v0.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: bif v0.16b, v2.16b, v3.16b
; CHECK-GI-NEXT: bif v1.16b, v2.16b, v5.16b
+; CHECK-GI-NEXT: cmgt v5.2d, v2.2d, v4.2d
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-GI-NEXT: cmgt v6.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: cmgt v7.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: bif v0.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT: bif v1.16b, v3.16b, v7.16b
; CHECK-GI-NEXT: cmgt v4.2d, v2.2d, v3.2d
-; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI12_0]
-; CHECK-GI-NEXT: bit v2.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: cmgt v3.2d, v0.2d, v5.2d
-; CHECK-GI-NEXT: cmgt v4.2d, v1.2d, v5.2d
-; CHECK-GI-NEXT: bif v0.16b, v5.16b, v3.16b
-; CHECK-GI-NEXT: bif v1.16b, v5.16b, v4.16b
-; CHECK-GI-NEXT: cmgt v3.2d, v2.2d, v5.2d
-; CHECK-GI-NEXT: bif v2.16b, v5.16b, v3.16b
-; CHECK-GI-NEXT: mov d3, v0.d[1]
-; CHECK-GI-NEXT: mov d4, v1.d[1]
-; CHECK-GI-NEXT: fmov x0, d0
-; CHECK-GI-NEXT: fmov x2, d1
-; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT: fmov x4, d2
-; CHECK-GI-NEXT: fmov x1, d3
-; CHECK-GI-NEXT: fmov x3, d4
-; CHECK-GI-NEXT: // kill: def $w4 killed $w4 killed $x4
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
-; CHECK-GI-NEXT: // kill: def $w3 killed $w3 killed $x3
+; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v1.16b, v4.16b
+; CHECK-GI-NEXT: bsl v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v0.s[2]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: mov v1.s[0], v1.s[0]
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: ret
%x = call <5 x i32> @llvm.fptosi.sat.v5f64.v5i32(<5 x double> %f)
ret <5 x i32> %x
@@ -431,49 +444,49 @@ define <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) {
;
; CHECK-GI-LABEL: test_signed_v6f64_v6i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
; CHECK-GI-NEXT: adrp x8, .LCPI13_1
+; CHECK-GI-NEXT: mov v4.d[1], v5.d[0]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT: mov v4.d[1], v5.d[0]
; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI13_1]
; CHECK-GI-NEXT: adrp x8, .LCPI13_0
+; CHECK-GI-NEXT: ldr q6, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT: fcvtzs v1.2d, v4.2d
; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-GI-NEXT: fcvtzs v2.2d, v4.2d
+; CHECK-GI-NEXT: fcvtzs v2.2d, v2.2d
+; CHECK-GI-NEXT: cmgt v4.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT: cmgt v5.2d, v3.2d, v2.2d
+; CHECK-GI-NEXT: bif v1.16b, v3.16b, v4.16b
; CHECK-GI-NEXT: cmgt v4.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT: cmgt v5.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT: cmgt v6.2d, v3.2d, v2.2d
-; CHECK-GI-NEXT: bif v0.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: bif v1.16b, v3.16b, v5.16b
-; CHECK-GI-NEXT: bif v2.16b, v3.16b, v6.16b
-; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI13_0]
-; CHECK-GI-NEXT: cmgt v4.2d, v0.2d, v3.2d
-; CHECK-GI-NEXT: cmgt v5.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT: cmgt v6.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT: bif v2.16b, v3.16b, v5.16b
; CHECK-GI-NEXT: bif v0.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: bif v1.16b, v3.16b, v5.16b
-; CHECK-GI-NEXT: bif v2.16b, v3.16b, v6.16b
-; CHECK-GI-NEXT: mov d3, v0.d[1]
-; CHECK-GI-NEXT: mov d4, v1.d[1]
-; CHECK-GI-NEXT: mov d5, v2.d[1]
-; CHECK-GI-NEXT: fmov x0, d0
-; CHECK-GI-NEXT: fmov x2, d1
-; CHECK-GI-NEXT: fmov x4, d2
-; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT: // kill: def $w4 killed $w4 killed $x4
-; CHECK-GI-NEXT: fmov x1, d3
-; CHECK-GI-NEXT: fmov x3, d4
-; CHECK-GI-NEXT: fmov x5, d5
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
-; CHECK-GI-NEXT: // kill: def $w3 killed $w3 killed $x3
-; CHECK-GI-NEXT: // kill: def $w5 killed $w5 killed $x5
+; CHECK-GI-NEXT: cmgt v3.2d, v1.2d, v6.2d
+; CHECK-GI-NEXT: cmgt v4.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: bif v1.16b, v6.16b, v3.16b
+; CHECK-GI-NEXT: cmgt v3.2d, v0.2d, v6.2d
+; CHECK-GI-NEXT: bif v2.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: bif v0.16b, v6.16b, v3.16b
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: mov v1.s[0], v1.s[0]
+; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: fmov x8, d3
+; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v0.s[2]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: fmov w5, s5
; CHECK-GI-NEXT: ret
%x = call <6 x i32> @llvm.fptosi.sat.v6f64.v6i32(<6 x double> %f)
ret <6 x i32> %x
@@ -902,14 +915,17 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: mov v1.16b, v0.16b
; CHECK-GI-NEXT: bl __unordtf2
-; CHECK-GI-NEXT: mov v0.s[0], w21
+; CHECK-GI-NEXT: mov v1.s[0], w21
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: csel w8, wzr, w19, ne
; CHECK-GI-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.s[1], w23
+; CHECK-GI-NEXT: mov v1.s[1], w23
; CHECK-GI-NEXT: ldp x30, x23, [sp, #80] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #128
; CHECK-GI-NEXT: ret
%x = call <3 x i32> @llvm.fptosi.sat.v3f128.v3i32(<3 x fp128> %f)
@@ -1221,11 +1237,24 @@ define <2 x i32> @test_signed_v2f16_v2i32(<2 x half> %f) {
}
define <3 x i32> @test_signed_v3f16_v3i32(<3 x half> %f) {
-; CHECK-LABEL: test_signed_v3f16_v3i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_signed_v3f16_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_signed_v3f16_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT: fcvtzs v1.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
%x = call <3 x i32> @llvm.fptosi.sat.v3f16.v3i32(<3 x half> %f)
ret <3 x i32> %x
}
@@ -1256,18 +1285,22 @@ define <5 x i32> @test_signed_v5f16_v5i32(<5 x half> %f) {
;
; CHECK-GI-LABEL: test_signed_v5f16_v5i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
; CHECK-GI-NEXT: mov v0.h[0], v0.h[4]
-; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT: mov s2, v1.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[2]
; CHECK-GI-NEXT: mov s4, v1.s[3]
; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w3, s4
; CHECK-GI-NEXT: ret
%x = call <5 x i32> @llvm.fptosi.sat.v5f16.v5i32(<5 x half> %f)
@@ -1291,22 +1324,26 @@ define <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
;
; CHECK-GI-LABEL: test_signed_v6f16_v6i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[5]
-; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT: fcvtl v1.4s, v2.4h
; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: mov s4, v1.s[1]
+; CHECK-GI-NEXT: mov s5, v0.s[3]
; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w3, s4
-; CHECK-GI-NEXT: fmov w4, s1
-; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fmov w5, s4
+; CHECK-GI-NEXT: fmov w3, s5
; CHECK-GI-NEXT: ret
%x = call <6 x i32> @llvm.fptosi.sat.v6f16.v6i32(<6 x half> %f)
ret <6 x i32> %x
@@ -1330,23 +1367,27 @@ define <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
;
; CHECK-GI-LABEL: test_signed_v7f16_v7i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[6]
-; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT: fcvtl v1.4s, v2.4h
; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v0.s[2]
; CHECK-GI-NEXT: mov s4, v0.s[3]
-; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: mov s6, v1.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
; CHECK-GI-NEXT: fmov w3, s4
-; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w5, s5
; CHECK-GI-NEXT: fmov w6, s6
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index e1670ad2dc053b..eb68125080f33a 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -48,10 +48,21 @@ define <2 x i32> @test_unsigned_v2f32_v2i32(<2 x float> %f) {
}
define <3 x i32> @test_unsigned_v3f32_v3i32(<3 x float> %f) {
-; CHECK-LABEL: test_unsigned_v3f32_v3i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_unsigned_v3f32_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_unsigned_v3f32_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
%x = call <3 x i32> @llvm.fptoui.sat.v3f32.v3i32(<3 x float> %f)
ret <3 x i32> %x
}
@@ -308,7 +319,10 @@ define <3 x i32> @test_unsigned_v3f64_v3i32(<3 x double> %f) {
; CHECK-GI-NEXT: bif v1.16b, v2.16b, v4.16b
; CHECK-GI-NEXT: cmhi v3.2d, v2.2d, v0.2d
; CHECK-GI-NEXT: bif v0.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
%x = call <3 x i32> @llvm.fptoui.sat.v3f64.v3i32(<3 x double> %f)
ret <3 x i32> %x
@@ -364,27 +378,25 @@ define <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-GI-NEXT: fcvtzu v3.2d, v4.2d
+; CHECK-GI-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtzu v2.2d, v2.2d
-; CHECK-GI-NEXT: cmhi v4.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: cmhi v3.2d, v1.2d, v0.2d
; CHECK-GI-NEXT: cmhi v5.2d, v1.2d, v2.2d
-; CHECK-GI-NEXT: bif v0.16b, v1.16b, v4.16b
+; CHECK-GI-NEXT: bif v0.16b, v1.16b, v3.16b
; CHECK-GI-NEXT: bif v2.16b, v1.16b, v5.16b
-; CHECK-GI-NEXT: cmhi v4.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT: bit v1.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: mov d3, v0.d[1]
-; CHECK-GI-NEXT: mov d4, v2.d[1]
-; CHECK-GI-NEXT: fmov x0, d0
-; CHECK-GI-NEXT: fmov x2, d2
-; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT: fmov x4, d1
-; CHECK-GI-NEXT: fmov x1, d3
-; CHECK-GI-NEXT: fmov x3, d4
-; CHECK-GI-NEXT: // kill: def $w4 killed $w4 killed $x4
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
-; CHECK-GI-NEXT: // kill: def $w3 killed $w3 killed $x3
+; CHECK-GI-NEXT: cmhi v3.2d, v1.2d, v4.2d
+; CHECK-GI-NEXT: bit v1.16b, v4.16b, v3.16b
+; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: mov v1.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v0.s[2]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
; CHECK-GI-NEXT: ret
%x = call <5 x i32> @llvm.fptoui.sat.v5f64.v5i32(<5 x double> %f)
ret <5 x i32> %x
@@ -403,40 +415,40 @@ define <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v6f64_v6i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT: mov v4.d[1], v5.d[0]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT: mov v4.d[1], v5.d[0]
-; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-GI-NEXT: movi v3.2d, #0x000000ffffffff
+; CHECK-GI-NEXT: fcvtzu v1.2d, v4.2d
; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtzu v2.2d, v2.2d
-; CHECK-GI-NEXT: fcvtzu v3.2d, v4.2d
-; CHECK-GI-NEXT: cmhi v4.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT: cmhi v5.2d, v1.2d, v2.2d
-; CHECK-GI-NEXT: cmhi v6.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT: bif v0.16b, v1.16b, v4.16b
-; CHECK-GI-NEXT: bif v2.16b, v1.16b, v5.16b
-; CHECK-GI-NEXT: bit v1.16b, v3.16b, v6.16b
-; CHECK-GI-NEXT: mov d3, v0.d[1]
-; CHECK-GI-NEXT: mov d4, v2.d[1]
-; CHECK-GI-NEXT: mov d5, v1.d[1]
-; CHECK-GI-NEXT: fmov x0, d0
-; CHECK-GI-NEXT: fmov x2, d2
-; CHECK-GI-NEXT: fmov x4, d1
-; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
-; CHECK-GI-NEXT: // kill: def $w4 killed $w4 killed $x4
-; CHECK-GI-NEXT: fmov x1, d3
-; CHECK-GI-NEXT: fmov x3, d4
-; CHECK-GI-NEXT: fmov x5, d5
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
-; CHECK-GI-NEXT: // kill: def $w3 killed $w3 killed $x3
-; CHECK-GI-NEXT: // kill: def $w5 killed $w5 killed $x5
+; CHECK-GI-NEXT: cmhi v4.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT: cmhi v5.2d, v3.2d, v2.2d
+; CHECK-GI-NEXT: bif v1.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: cmhi v4.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT: bif v2.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT: bif v0.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: mov v1.s[0], v1.s[0]
+; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: fmov x8, d3
+; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v0.s[2]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: fmov w5, s5
; CHECK-GI-NEXT: ret
%x = call <6 x i32> @llvm.fptoui.sat.v6f64.v6i32(<6 x double> %f)
ret <6 x i32> %x
@@ -781,12 +793,15 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-GI-NEXT: csel x8, x23, x21, gt
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: mov v1.s[0], w19
; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x30, x23, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.s[1], w20
+; CHECK-GI-NEXT: mov v1.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.s[2], w0
+; CHECK-GI-NEXT: mov v1.s[2], w0
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #112
; CHECK-GI-NEXT: ret
%x = call <3 x i32> @llvm.fptoui.sat.v3f128.v3i32(<3 x fp128> %f)
@@ -1052,11 +1067,24 @@ define <2 x i32> @test_unsigned_v2f16_v2i32(<2 x half> %f) {
}
define <3 x i32> @test_unsigned_v3f16_v3i32(<3 x half> %f) {
-; CHECK-LABEL: test_unsigned_v3f16_v3i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_unsigned_v3f16_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_unsigned_v3f16_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT: fcvtzu v1.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
%x = call <3 x i32> @llvm.fptoui.sat.v3f16.v3i32(<3 x half> %f)
ret <3 x i32> %x
}
@@ -1087,18 +1115,22 @@ define <5 x i32> @test_unsigned_v5f16_v5i32(<5 x half> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v5f16_v5i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
; CHECK-GI-NEXT: mov v0.h[0], v0.h[4]
-; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-GI-NEXT: mov s2, v1.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[2]
; CHECK-GI-NEXT: mov s4, v1.s[3]
; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w3, s4
; CHECK-GI-NEXT: ret
%x = call <5 x i32> @llvm.fptoui.sat.v5f16.v5i32(<5 x half> %f)
@@ -1122,22 +1154,26 @@ define <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v6f16_v6i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[5]
-; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT: fcvtl v1.4s, v2.4h
; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: mov s4, v1.s[1]
+; CHECK-GI-NEXT: mov s5, v0.s[3]
; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w3, s4
-; CHECK-GI-NEXT: fmov w4, s1
-; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fmov w5, s4
+; CHECK-GI-NEXT: fmov w3, s5
; CHECK-GI-NEXT: ret
%x = call <6 x i32> @llvm.fptoui.sat.v6f16.v6i32(<6 x half> %f)
ret <6 x i32> %x
@@ -1161,23 +1197,27 @@ define <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v7f16_v7i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[6]
-; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-GI-NEXT: fcvtl v1.4s, v2.4h
; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v0.s[2]
; CHECK-GI-NEXT: mov s4, v0.s[3]
-; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: mov s6, v1.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
; CHECK-GI-NEXT: fmov w3, s4
-; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w5, s5
; CHECK-GI-NEXT: fmov w6, s6
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 2187717c4148ae..89ac7dbe42487d 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -130,9 +130,12 @@ define <2 x half> @fptrunc_v2f128_v2f16(<2 x fp128> %a) {
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: mov h0, v1.h[1]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
@@ -261,10 +264,13 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fcvt s2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v2.s[0]
; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
-; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = fptrunc <3 x double> %a to <3 x float>
@@ -295,6 +301,8 @@ define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
; CHECK-GI-NEXT: fcvt h0, d0
; CHECK-GI-NEXT: fcvt h1, d1
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -318,8 +326,16 @@ define <3 x half> @fptrunc_v3f64_v3f16(<3 x double> %a) {
; CHECK-GI-NEXT: fcvt h0, d0
; CHECK-GI-NEXT: fcvt h1, d1
; CHECK-GI-NEXT: fcvt h2, d2
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -366,6 +382,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = fptrunc <2 x float> %a to <2 x half>
@@ -373,10 +392,29 @@ entry:
}
define <3 x half> @fptrunc_v3f32_v3f16(<3 x float> %a) {
-; CHECK-LABEL: fptrunc_v3f32_v3f16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptrunc_v3f32_v3f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptrunc_v3f32_v3f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%c = fptrunc <3 x float> %a to <3 x half>
ret <3 x half> %c
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index feb13da64cbf8a..ad8576c63b1aea 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -157,38 +157,42 @@ define <3 x double> @frem_v3f64(<3 x double> %a, <3 x double> %b) {
;
; CHECK-GI-LABEL: frem_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d12, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d11, d10, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #24] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #40] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 48
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
-; CHECK-GI-NEXT: .cfi_offset b11, -40
-; CHECK-GI-NEXT: .cfi_offset b12, -48
+; CHECK-GI-NEXT: sub sp, sp, #80
+; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
+; CHECK-GI-NEXT: .cfi_offset b10, -40
+; CHECK-GI-NEXT: .cfi_offset b11, -48
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d1, d3
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: fmov d10, d4
; CHECK-GI-NEXT: fmov d11, d5
; CHECK-GI-NEXT: bl fmod
-; CHECK-GI-NEXT: fmov d12, d0
-; CHECK-GI-NEXT: fmov d0, d8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d1, d10
+; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl fmod
-; CHECK-GI-NEXT: fmov d8, d0
-; CHECK-GI-NEXT: fmov d0, d9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d1, d11
+; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl fmod
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldp d11, d10, [sp, #8] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fmov d0, d12
-; CHECK-GI-NEXT: ldr d12, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #80
; CHECK-GI-NEXT: ret
entry:
%c = frem <3 x double> %a, %b
@@ -420,7 +424,9 @@ define <3 x float> @frem_v3f32(<3 x float> %a, <3 x float> %b) {
; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #80
; CHECK-GI-NEXT: ret
entry:
@@ -880,7 +886,13 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #176
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 2afc56a7139fbf..eac17ec72bc990 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -138,29 +138,33 @@ define <3 x double> @sin_v3f64(<3 x double> %a) {
;
; CHECK-GI-LABEL: sin_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: bl sin
-; CHECK-GI-NEXT: fmov d10, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl sin
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl sin
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: fmov d0, d10
-; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.sin.v3f64(<3 x double> %a)
@@ -354,7 +358,9 @@ define <3 x float> @sin_v3f32(<3 x float> %a) {
; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -725,7 +731,13 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -1440,29 +1452,33 @@ define <3 x double> @cos_v3f64(<3 x double> %a) {
;
; CHECK-GI-LABEL: cos_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT: .cfi_offset w30, -8
-; CHECK-GI-NEXT: .cfi_offset b8, -16
-; CHECK-GI-NEXT: .cfi_offset b9, -24
-; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
; CHECK-GI-NEXT: fmov d8, d1
; CHECK-GI-NEXT: fmov d9, d2
; CHECK-GI-NEXT: bl cos
-; CHECK-GI-NEXT: fmov d10, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d8
; CHECK-GI-NEXT: bl cos
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov d0, d9
; CHECK-GI-NEXT: bl cos
-; CHECK-GI-NEXT: fmov d1, d8
-; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: fmov d0, d10
-; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
%c = call <3 x double> @llvm.cos.v3f64(<3 x double> %a)
@@ -1656,7 +1672,9 @@ define <3 x float> @cos_v3f32(<3 x float> %a) {
; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
entry:
@@ -2027,7 +2045,13 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 6c5fd8e52b017c..15e93e244f1d5c 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -84,6 +84,7 @@ define <3 x double> @sqrt_v3f64(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fsqrt d2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: fsqrt v0.2d, v0.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -115,10 +116,21 @@ entry:
}
define <3 x float> @sqrt_v3f32(<3 x float> %a) {
-; CHECK-LABEL: sqrt_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fsqrt v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqrt_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fsqrt v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqrt_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: fsqrt v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = call <3 x float> @llvm.sqrt.v3f32(<3 x float> %a)
ret <3 x float> %c
@@ -195,27 +207,52 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: sqrt_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: fsqrt v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: fsqrt v2.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[4], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[5], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[6], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[6]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: sqrt_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fsqrt v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-FP16-NEXT: fsqrt v1.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.sqrt.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 61964060ca2c8b..9a49266ace1d9b 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1155,28 +1155,29 @@ define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64>
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d6 killed $d6 def $q6
-; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d7 killed $d7 def $q7
+; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
; CHECK-GI-NEXT: ldr x8, [sp]
; CHECK-GI-NEXT: ldr x10, [sp, #24]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
-; CHECK-GI-NEXT: cmgt v2.2d, v5.2d, v2.2d
-; CHECK-GI-NEXT: ldp d1, d4, [sp, #8]
; CHECK-GI-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: ldp d1, d4, [sp, #8]
+; CHECK-GI-NEXT: cmgt v2.2d, v5.2d, v2.2d
; CHECK-GI-NEXT: mov v1.d[1], v4.d[0]
; CHECK-GI-NEXT: cmgt v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: sbfx x9, x9, #0, #1
; CHECK-GI-NEXT: bsl v0.16b, v6.16b, v1.16b
; CHECK-GI-NEXT: and x8, x8, x9
; CHECK-GI-NEXT: bic x9, x10, x9
; CHECK-GI-NEXT: orr x8, x8, x9
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
entry:
%c = icmp slt <3 x i64> %a, %b
@@ -1227,22 +1228,37 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32>
;
; CHECK-GI-LABEL: v3i32_i32:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v4.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v1.s[0]
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: mov v6.s[0], w8
; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
-; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: mov v4.s[0], w8
-; CHECK-GI-NEXT: mov v5.s[0], w9
-; CHECK-GI-NEXT: mov v4.s[1], w8
-; CHECK-GI-NEXT: mov v5.s[1], w9
-; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov v5.s[2], w9
-; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v1.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
-; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v4.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v6.s[1], w8
+; CHECK-GI-NEXT: mov v4.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v0.s[0], w9
+; CHECK-GI-NEXT: mov v6.s[2], w8
+; CHECK-GI-NEXT: cmgt v1.4s, v5.4s, v4.4s
+; CHECK-GI-NEXT: mov v4.s[0], v2.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT: neg v6.4s, v6.4s
+; CHECK-GI-NEXT: mov v4.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v3.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], w9
+; CHECK-GI-NEXT: sshl v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT: mov v4.s[2], v2.s[2]
+; CHECK-GI-NEXT: mov v5.s[2], v3.s[2]
+; CHECK-GI-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: and v1.16b, v4.16b, v1.16b
+; CHECK-GI-NEXT: and v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = icmp slt <3 x i32> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 54ee693db1239f..c67d3b4ee9f410 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -299,12 +299,18 @@ define <3 x float> @insert_v3f32_c(<3 x float> %a, float %b, i32 %c) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sub sp, sp, #16
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
; CHECK-GI-NEXT: mov w9, w0
; CHECK-GI-NEXT: mov x8, sp
-; CHECK-GI-NEXT: str q0, [sp]
; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: str q2, [sp]
; CHECK-GI-NEXT: str s1, [x8, x9, lsl #2]
-; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ldr q1, [sp], #16
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x float> %a, float %b, i32 %c
@@ -1019,12 +1025,18 @@ define <3 x i32> @insert_v3i32_c(<3 x i32> %a, i32 %b, i32 %c) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sub sp, sp, #16
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov w9, w1
; CHECK-GI-NEXT: mov x8, sp
-; CHECK-GI-NEXT: str q0, [sp]
; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: str q1, [sp]
; CHECK-GI-NEXT: str w0, [x8, x9, lsl #2]
-; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ldr q1, [sp], #16
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x i32> %a, i32 %b, i32 %c
@@ -1578,10 +1590,13 @@ define float @extract_v3f32_c(<3 x float> %a, i32 %c) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sub sp, sp, #16
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov w9, w0
; CHECK-GI-NEXT: mov x8, sp
-; CHECK-GI-NEXT: str q0, [sp]
; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: str q1, [sp]
; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2]
; CHECK-GI-NEXT: add sp, sp, #16
; CHECK-GI-NEXT: ret
@@ -2272,10 +2287,13 @@ define i32 @extract_v3i32_c(<3 x i32> %a, i32 %c) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sub sp, sp, #16
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov w9, w0
; CHECK-GI-NEXT: mov x8, sp
-; CHECK-GI-NEXT: str q0, [sp]
; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: str q1, [sp]
; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2]
; CHECK-GI-NEXT: add sp, sp, #16
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 81c1a64f2d434f..caff8c527d34a5 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -1345,18 +1345,16 @@ define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) {
;
; CHECK-GI-LABEL: stofp_v3i128_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: sub sp, sp, #80
+; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
; CHECK-GI-NEXT: .cfi_offset w19, -8
; CHECK-GI-NEXT: .cfi_offset w20, -16
; CHECK-GI-NEXT: .cfi_offset w21, -24
; CHECK-GI-NEXT: .cfi_offset w22, -32
; CHECK-GI-NEXT: .cfi_offset w30, -48
-; CHECK-GI-NEXT: .cfi_offset b8, -56
-; CHECK-GI-NEXT: .cfi_offset b9, -64
; CHECK-GI-NEXT: mov x19, x2
; CHECK-GI-NEXT: mov x20, x3
; CHECK-GI-NEXT: mov x21, x4
@@ -1364,19 +1362,24 @@ define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) {
; CHECK-GI-NEXT: bl __floattidf
; CHECK-GI-NEXT: mov x0, x19
; CHECK-GI-NEXT: mov x1, x20
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: bl __floattidf
; CHECK-GI-NEXT: mov x0, x21
; CHECK-GI-NEXT: mov x1, x22
-; CHECK-GI-NEXT: fmov d9, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: bl __floattidf
-; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT: fmov d0, d8
-; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fmov d1, d9
-; CHECK-GI-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #80
; CHECK-GI-NEXT: ret
entry:
%c = sitofp <3 x i128> %a to <3 x double>
@@ -1422,18 +1425,16 @@ define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) {
;
; CHECK-GI-LABEL: utofp_v3i128_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: sub sp, sp, #80
+; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
; CHECK-GI-NEXT: .cfi_offset w19, -8
; CHECK-GI-NEXT: .cfi_offset w20, -16
; CHECK-GI-NEXT: .cfi_offset w21, -24
; CHECK-GI-NEXT: .cfi_offset w22, -32
; CHECK-GI-NEXT: .cfi_offset w30, -48
-; CHECK-GI-NEXT: .cfi_offset b8, -56
-; CHECK-GI-NEXT: .cfi_offset b9, -64
; CHECK-GI-NEXT: mov x19, x2
; CHECK-GI-NEXT: mov x20, x3
; CHECK-GI-NEXT: mov x21, x4
@@ -1441,19 +1442,24 @@ define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) {
; CHECK-GI-NEXT: bl __floatuntidf
; CHECK-GI-NEXT: mov x0, x19
; CHECK-GI-NEXT: mov x1, x20
-; CHECK-GI-NEXT: fmov d8, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: bl __floatuntidf
; CHECK-GI-NEXT: mov x0, x21
; CHECK-GI-NEXT: mov x1, x22
-; CHECK-GI-NEXT: fmov d9, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: bl __floatuntidf
-; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fmov d2, d0
-; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT: fmov d0, d8
-; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fmov d1, d9
-; CHECK-GI-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: add sp, sp, #80
; CHECK-GI-NEXT: ret
entry:
%c = uitofp <3 x i128> %a to <3 x double>
@@ -2009,13 +2015,16 @@ define <3 x double> @stofp_v3i32_v3f64(<3 x i32> %a) {
;
; CHECK-GI-LABEL: stofp_v3i32_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT: scvtf v3.2d, v1.2d
-; CHECK-GI-NEXT: scvtf v2.2d, v0.2d
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT: sshll v1.2d, v2.2s, #0
+; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: scvtf v2.2d, v1.2d
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
-; CHECK-GI-NEXT: mov d1, v3.d[1]
-; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = sitofp <3 x i32> %a to <3 x double>
@@ -2037,13 +2046,16 @@ define <3 x double> @utofp_v3i32_v3f64(<3 x i32> %a) {
;
; CHECK-GI-LABEL: utofp_v3i32_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT: ucvtf v3.2d, v1.2d
-; CHECK-GI-NEXT: ucvtf v2.2d, v0.2d
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT: ushll v1.2d, v2.2s, #0
+; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: ucvtf v2.2d, v1.2d
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
-; CHECK-GI-NEXT: mov d1, v3.d[1]
-; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = uitofp <3 x i32> %a to <3 x double>
@@ -2596,7 +2608,11 @@ define <3 x double> @stofp_v3i16_v3f64(<3 x i16> %a) {
;
; CHECK-GI-LABEL: stofp_v3i16_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0
; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0
; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
@@ -2626,7 +2642,11 @@ define <3 x double> @utofp_v3i16_v3f64(<3 x i16> %a) {
;
; CHECK-GI-LABEL: utofp_v3i16_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll v0.2d, v1.2s, #0
; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
@@ -4328,7 +4348,9 @@ define <3 x float> @stofp_v3i128_v3f32(<3 x i128> %a) {
; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #80
; CHECK-GI-NEXT: ret
entry:
@@ -4412,7 +4434,9 @@ define <3 x float> @utofp_v3i128_v3f32(<3 x i128> %a) {
; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: add sp, sp, #80
; CHECK-GI-NEXT: ret
entry:
@@ -4461,13 +4485,16 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: scvtf v2.2d, v2.2d
+; CHECK-GI-NEXT: scvtf v1.2d, v2.2d
; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
-; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
-; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[0], v2.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v2.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = sitofp <3 x i64> %a to <3 x float>
@@ -4493,13 +4520,16 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-GI-NEXT: ucvtf v1.2d, v2.2d
; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
-; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[0], v2.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v2.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = uitofp <3 x i64> %a to <3 x float>
@@ -4831,20 +4861,42 @@ entry:
}
define <3 x float> @stofp_v3i32_v3f32(<3 x i32> %a) {
-; CHECK-LABEL: stofp_v3i32_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: stofp_v3i32_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: scvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: stofp_v3i32_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: scvtf v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = sitofp <3 x i32> %a to <3 x float>
ret <3 x float> %c
}
define <3 x float> @utofp_v3i32_v3f32(<3 x i32> %a) {
-; CHECK-LABEL: utofp_v3i32_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: utofp_v3i32_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: utofp_v3i32_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = uitofp <3 x i32> %a to <3 x float>
ret <3 x float> %c
@@ -4977,22 +5029,48 @@ entry:
}
define <3 x float> @stofp_v3i16_v3f32(<3 x i16> %a) {
-; CHECK-LABEL: stofp_v3i16_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: stofp_v3i16_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: scvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: stofp_v3i16_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: scvtf v1.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = sitofp <3 x i16> %a to <3 x float>
ret <3 x float> %c
}
define <3 x float> @utofp_v3i16_v3f32(<3 x i16> %a) {
-; CHECK-LABEL: utofp_v3i16_v3f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: utofp_v3i16_v3f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: utofp_v3i16_v3f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: ucvtf v1.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%c = uitofp <3 x i16> %a to <3 x float>
ret <3 x float> %c
@@ -5258,7 +5336,10 @@ define <3 x float> @stofp_v3i8_v3f32(<3 x i8> %a) {
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: scvtf v0.4s, v0.4s
+; CHECK-GI-NEXT: scvtf v1.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = sitofp <3 x i8> %a to <3 x float>
@@ -5288,7 +5369,10 @@ define <3 x float> @utofp_v3i8_v3f32(<3 x i8> %a) {
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-GI-NEXT: ucvtf v1.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = uitofp <3 x i8> %a to <3 x float>
@@ -5690,11 +5774,14 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) {
; CHECK-GI-NOFP16-NEXT: mov x1, x20
; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NOFP16-NEXT: bl __floattisf
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s0
-; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: add sp, sp, #48
; CHECK-GI-NOFP16-NEXT: ret
@@ -5721,7 +5808,10 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) {
; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0]
-; CHECK-GI-FP16-NEXT: fmov d0, d1
+; CHECK-GI-FP16-NEXT: mov h0, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: add sp, sp, #48
; CHECK-GI-FP16-NEXT: ret
entry:
@@ -5803,11 +5893,14 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) {
; CHECK-GI-NOFP16-NEXT: mov x1, x20
; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s0
-; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: add sp, sp, #48
; CHECK-GI-NOFP16-NEXT: ret
@@ -5834,7 +5927,10 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) {
; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0]
-; CHECK-GI-FP16-NEXT: fmov d0, d1
+; CHECK-GI-FP16-NEXT: mov h0, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: add sp, sp, #48
; CHECK-GI-FP16-NEXT: ret
entry:
@@ -5927,55 +6023,63 @@ define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) {
;
; CHECK-GI-NOFP16-LABEL: stofp_v3i128_v3f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: sub sp, sp, #80
-; CHECK-GI-NOFP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-NOFP16-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 64
; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8
; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16
; CHECK-GI-NOFP16-NEXT: .cfi_offset w21, -24
; CHECK-GI-NOFP16-NEXT: .cfi_offset w22, -32
; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -48
+; CHECK-GI-NOFP16-NEXT: .cfi_offset b8, -56
+; CHECK-GI-NOFP16-NEXT: .cfi_offset b9, -64
; CHECK-GI-NOFP16-NEXT: mov x19, x2
; CHECK-GI-NOFP16-NEXT: mov x20, x3
; CHECK-GI-NOFP16-NEXT: mov x21, x4
; CHECK-GI-NOFP16-NEXT: mov x22, x5
; CHECK-GI-NOFP16-NEXT: bl __floattisf
-; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: mov x0, x19
; CHECK-GI-NOFP16-NEXT: mov x1, x20
-; CHECK-GI-NOFP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: fcvt h8, s0
; CHECK-GI-NOFP16-NEXT: bl __floattisf
-; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: mov x0, x21
; CHECK-GI-NOFP16-NEXT: mov x1, x22
-; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: fcvt h9, s0
; CHECK-GI-NOFP16-NEXT: bl __floattisf
-; CHECK-GI-NOFP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: fmov w8, s8
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NOFP16-NEXT: add sp, sp, #80
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-NOFP16-NEXT: fmov w8, s9
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], w8
+; CHECK-GI-NOFP16-NEXT: fmov w8, s0
+; CHECK-GI-NOFP16-NEXT: mov v1.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, v1.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v1.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], w9
+; CHECK-GI-NOFP16-NEXT: fmov d0, d1
+; CHECK-GI-NOFP16-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: stofp_v3i128_v3f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: sub sp, sp, #80
-; CHECK-GI-FP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-FP16-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 64
; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8
; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16
; CHECK-GI-FP16-NEXT: .cfi_offset w21, -24
; CHECK-GI-FP16-NEXT: .cfi_offset w22, -32
; CHECK-GI-FP16-NEXT: .cfi_offset w30, -48
+; CHECK-GI-FP16-NEXT: .cfi_offset b8, -56
+; CHECK-GI-FP16-NEXT: .cfi_offset b9, -64
; CHECK-GI-FP16-NEXT: mov x19, x2
; CHECK-GI-FP16-NEXT: mov x20, x3
; CHECK-GI-FP16-NEXT: mov x21, x4
@@ -5983,24 +6087,28 @@ define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) {
; CHECK-GI-FP16-NEXT: bl __floattihf
; CHECK-GI-FP16-NEXT: mov x0, x19
; CHECK-GI-FP16-NEXT: mov x1, x20
-; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: fmov s8, s0
; CHECK-GI-FP16-NEXT: bl __floattihf
; CHECK-GI-FP16-NEXT: mov x0, x21
; CHECK-GI-FP16-NEXT: mov x1, x22
-; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: fmov s9, s0
; CHECK-GI-FP16-NEXT: bl __floattihf
-; CHECK-GI-FP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload
-; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-FP16-NEXT: add sp, sp, #80
+; CHECK-GI-FP16-NEXT: fmov w8, s8
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s9
+; CHECK-GI-FP16-NEXT: mov v1.s[1], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s0
+; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v1.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v1.s[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v1.h[2], w9
+; CHECK-GI-FP16-NEXT: fmov d0, d1
+; CHECK-GI-FP16-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload
; CHECK-GI-FP16-NEXT: ret
entry:
%c = sitofp <3 x i128> %a to <3 x half>
@@ -6092,55 +6200,63 @@ define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) {
;
; CHECK-GI-NOFP16-LABEL: utofp_v3i128_v3f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: sub sp, sp, #80
-; CHECK-GI-NOFP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-NOFP16-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 64
; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8
; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16
; CHECK-GI-NOFP16-NEXT: .cfi_offset w21, -24
; CHECK-GI-NOFP16-NEXT: .cfi_offset w22, -32
; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -48
+; CHECK-GI-NOFP16-NEXT: .cfi_offset b8, -56
+; CHECK-GI-NOFP16-NEXT: .cfi_offset b9, -64
; CHECK-GI-NOFP16-NEXT: mov x19, x2
; CHECK-GI-NOFP16-NEXT: mov x20, x3
; CHECK-GI-NOFP16-NEXT: mov x21, x4
; CHECK-GI-NOFP16-NEXT: mov x22, x5
; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: mov x0, x19
; CHECK-GI-NOFP16-NEXT: mov x1, x20
-; CHECK-GI-NOFP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: fcvt h8, s0
; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: mov x0, x21
; CHECK-GI-NOFP16-NEXT: mov x1, x22
-; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: fcvt h9, s0
; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: fmov w8, s8
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NOFP16-NEXT: add sp, sp, #80
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-NOFP16-NEXT: fmov w8, s9
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], w8
+; CHECK-GI-NOFP16-NEXT: fmov w8, s0
+; CHECK-GI-NOFP16-NEXT: mov v1.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, v1.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v1.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], w9
+; CHECK-GI-NOFP16-NEXT: fmov d0, d1
+; CHECK-GI-NOFP16-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: utofp_v3i128_v3f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: sub sp, sp, #80
-; CHECK-GI-FP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-FP16-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 64
; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8
; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16
; CHECK-GI-FP16-NEXT: .cfi_offset w21, -24
; CHECK-GI-FP16-NEXT: .cfi_offset w22, -32
; CHECK-GI-FP16-NEXT: .cfi_offset w30, -48
+; CHECK-GI-FP16-NEXT: .cfi_offset b8, -56
+; CHECK-GI-FP16-NEXT: .cfi_offset b9, -64
; CHECK-GI-FP16-NEXT: mov x19, x2
; CHECK-GI-FP16-NEXT: mov x20, x3
; CHECK-GI-FP16-NEXT: mov x21, x4
@@ -6148,24 +6264,28 @@ define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) {
; CHECK-GI-FP16-NEXT: bl __floatuntihf
; CHECK-GI-FP16-NEXT: mov x0, x19
; CHECK-GI-FP16-NEXT: mov x1, x20
-; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: fmov s8, s0
; CHECK-GI-FP16-NEXT: bl __floatuntihf
; CHECK-GI-FP16-NEXT: mov x0, x21
; CHECK-GI-FP16-NEXT: mov x1, x22
-; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: fmov s9, s0
; CHECK-GI-FP16-NEXT: bl __floatuntihf
-; CHECK-GI-FP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload
-; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-FP16-NEXT: add sp, sp, #80
+; CHECK-GI-FP16-NEXT: fmov w8, s8
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s9
+; CHECK-GI-FP16-NEXT: mov v1.s[1], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s0
+; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v1.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v1.s[2]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v1.h[2], w9
+; CHECK-GI-FP16-NEXT: fmov d0, d1
+; CHECK-GI-FP16-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload
; CHECK-GI-FP16-NEXT: ret
entry:
%c = uitofp <3 x i128> %a to <3 x half>
@@ -6202,6 +6322,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: stofp_v2i64_v2f16:
@@ -6211,6 +6334,8 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-FP16-NEXT: fcvt h0, d0
; CHECK-GI-FP16-NEXT: fcvt h1, d1
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
@@ -6248,6 +6373,9 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: utofp_v2i64_v2f16:
@@ -6257,6 +6385,8 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-FP16-NEXT: fcvt h0, d0
; CHECK-GI-FP16-NEXT: fcvt h1, d1
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
@@ -6288,7 +6418,18 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) {
; CHECK-GI-NOFP16-NEXT: scvtf v0.2d, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: stofp_v3i64_v3f16:
@@ -6303,8 +6444,16 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) {
; CHECK-GI-FP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT: fcvt h0, d0
; CHECK-GI-FP16-NEXT: fcvt h1, d1
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-FP16-NEXT: fmov w8, s0
+; CHECK-GI-FP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s1
+; CHECK-GI-FP16-NEXT: mov v0.s[1], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s2
+; CHECK-GI-FP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v0.h[2], w9
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
@@ -6336,7 +6485,18 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) {
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2d, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: utofp_v3i64_v3f16:
@@ -6351,8 +6511,16 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) {
; CHECK-GI-FP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT: fcvt h0, d0
; CHECK-GI-FP16-NEXT: fcvt h1, d1
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-FP16-NEXT: fmov w8, s0
+; CHECK-GI-FP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s1
+; CHECK-GI-FP16-NEXT: mov v0.s[1], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s2
+; CHECK-GI-FP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v0.h[2], w9
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
@@ -7184,6 +7352,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = sitofp <2 x i32> %a to <2 x half>
@@ -7204,6 +7375,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = uitofp <2 x i32> %a to <2 x half>
@@ -7211,22 +7385,62 @@ entry:
}
define <3 x half> @stofp_v3i32_v3f16(<3 x i32> %a) {
-; CHECK-LABEL: stofp_v3i32_v3f16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: stofp_v3i32_v3f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: scvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: stofp_v3i32_v3f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: scvtf v0.4s, v1.4s
+; CHECK-GI-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-GI-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%c = sitofp <3 x i32> %a to <3 x half>
ret <3 x half> %c
}
define <3 x half> @utofp_v3i32_v3f16(<3 x i32> %a) {
-; CHECK-LABEL: utofp_v3i32_v3f16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: utofp_v3i32_v3f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: utofp_v3i32_v3f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: ucvtf v0.4s, v1.4s
+; CHECK-GI-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-GI-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%c = uitofp <3 x i32> %a to <3 x half>
ret <3 x half> %c
@@ -7411,12 +7625,18 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = sitofp <2 x i16> %a to <2 x half>
@@ -7446,12 +7666,18 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = uitofp <2 x i16> %a to <2 x half>
@@ -7473,14 +7699,44 @@ define <3 x half> @stofp_v3i16_v3f16(<3 x i16> %a) {
;
; CHECK-GI-NOFP16-LABEL: stofp_v3i16_v3f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: sshll v0.4s, v1.4h, #0
; CHECK-GI-NOFP16-NEXT: scvtf v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: stofp_v3i16_v3f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: scvtf v1.4h, v1.4h
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = sitofp <3 x i16> %a to <3 x half>
@@ -7502,14 +7758,44 @@ define <3 x half> @utofp_v3i16_v3f16(<3 x i16> %a) {
;
; CHECK-GI-NOFP16-LABEL: utofp_v3i16_v3f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: ushll v0.4s, v1.4h, #0
; CHECK-GI-NOFP16-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: utofp_v3i16_v3f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-FP16-NEXT: ucvtf v1.4h, v1.4h
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = uitofp <3 x i16> %a to <3 x half>
@@ -7933,6 +8219,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16:
@@ -7941,6 +8230,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-FP16-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-FP16-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = sitofp <2 x i8> %a to <2 x half>
@@ -7984,6 +8276,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16:
@@ -7992,6 +8287,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = uitofp <2 x i8> %a to <2 x half>
@@ -8034,7 +8332,18 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-NOFP16-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NOFP16-NEXT: scvtf v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: stofp_v3i8_v3f16:
@@ -8043,7 +8352,18 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
; CHECK-GI-FP16-NEXT: mov v0.b[1], w1
; CHECK-GI-FP16-NEXT: mov v0.b[2], w2
; CHECK-GI-FP16-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: scvtf v1.4h, v0.4h
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = sitofp <3 x i8> %a to <3 x half>
@@ -8084,7 +8404,18 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-NOFP16-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NOFP16-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: utofp_v3i8_v3f16:
@@ -8093,7 +8424,18 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
; CHECK-GI-FP16-NEXT: mov v0.b[1], w1
; CHECK-GI-FP16-NEXT: mov v0.b[2], w2
; CHECK-GI-FP16-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: ucvtf v1.4h, v0.4h
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT: umov w9, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v0.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v0.s[2], w8
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v0.h[2], w9
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
%c = uitofp <3 x i8> %a to <3 x half>
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index c1ea891bc86e7e..33e8a85784d139 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -109,11 +109,14 @@ define <2 x half> @exp10_v2f16(<2 x half> %x) {
; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill
; GISEL-NEXT: fmov s0, s1
; GISEL-NEXT: bl exp10f
-; GISEL-NEXT: fcvt h1, s0
-; GISEL-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; GISEL-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
; GISEL-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload
-; GISEL-NEXT: mov v0.h[1], v1.h[0]
+; GISEL-NEXT: mov v1.h[1], v0.h[0]
+; GISEL-NEXT: mov h0, v1.h[1]
+; GISEL-NEXT: mov v1.h[1], v0.h[0]
+; GISEL-NEXT: mov v0.16b, v1.16b
; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0
; GISEL-NEXT: add sp, sp, #32
; GISEL-NEXT: ret
@@ -165,10 +168,9 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
;
; GISEL-LABEL: exp10_v3f16:
; GISEL: // %bb.0:
-; GISEL-NEXT: sub sp, sp, #64
-; GISEL-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
-; GISEL-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
-; GISEL-NEXT: .cfi_def_cfa_offset 64
+; GISEL-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; GISEL-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 32
; GISEL-NEXT: .cfi_offset w30, -16
; GISEL-NEXT: .cfi_offset b8, -24
; GISEL-NEXT: .cfi_offset b9, -32
@@ -178,24 +180,27 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
; GISEL-NEXT: fcvt s0, h0
; GISEL-NEXT: bl exp10f
; GISEL-NEXT: fcvt s1, h8
-; GISEL-NEXT: fcvt h0, s0
-; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; GISEL-NEXT: fcvt h8, s0
; GISEL-NEXT: fmov s0, s1
; GISEL-NEXT: bl exp10f
; GISEL-NEXT: fcvt s1, h9
-; GISEL-NEXT: fcvt h0, s0
-; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill
+; GISEL-NEXT: fcvt h9, s0
; GISEL-NEXT: fmov s0, s1
; GISEL-NEXT: bl exp10f
-; GISEL-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload
+; GISEL-NEXT: fmov w8, s8
; GISEL-NEXT: fcvt h0, s0
-; GISEL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; GISEL-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
-; GISEL-NEXT: mov v1.h[1], v2.h[0]
-; GISEL-NEXT: mov v1.h[2], v0.h[0]
-; GISEL-NEXT: mov v0.16b, v1.16b
-; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0
-; GISEL-NEXT: add sp, sp, #64
+; GISEL-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; GISEL-NEXT: mov v1.s[0], w8
+; GISEL-NEXT: fmov w8, s9
+; GISEL-NEXT: mov v1.s[1], w8
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: mov v1.s[2], w8
+; GISEL-NEXT: mov w8, v1.s[1]
+; GISEL-NEXT: mov w9, v1.s[2]
+; GISEL-NEXT: mov v1.h[1], w8
+; GISEL-NEXT: mov v1.h[2], w9
+; GISEL-NEXT: fmov d0, d1
+; GISEL-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; GISEL-NEXT: ret
%r = call <3 x half> @llvm.exp10.v3f16(<3 x half> %x)
ret <3 x half> %r
@@ -436,7 +441,9 @@ define <3 x float> @exp10_v3f32(<3 x float> %x) {
; GISEL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
; GISEL-NEXT: mov v1.s[1], v2.s[0]
; GISEL-NEXT: mov v1.s[2], v0.s[0]
-; GISEL-NEXT: mov v0.16b, v1.16b
+; GISEL-NEXT: mov v0.s[0], v1.s[0]
+; GISEL-NEXT: mov v0.s[1], v1.s[1]
+; GISEL-NEXT: mov v0.s[2], v1.s[2]
; GISEL-NEXT: add sp, sp, #64
; GISEL-NEXT: ret
%r = call <3 x float> @llvm.exp10.v3f32(<3 x float> %x)
@@ -624,29 +631,33 @@ define <3 x double> @exp10_v3f64(<3 x double> %x) {
;
; GISEL-LABEL: exp10_v3f64:
; GISEL: // %bb.0:
-; GISEL-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; GISEL-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; GISEL-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; GISEL-NEXT: .cfi_def_cfa_offset 32
-; GISEL-NEXT: .cfi_offset w30, -8
-; GISEL-NEXT: .cfi_offset b8, -16
-; GISEL-NEXT: .cfi_offset b9, -24
-; GISEL-NEXT: .cfi_offset b10, -32
+; GISEL-NEXT: sub sp, sp, #64
+; GISEL-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; GISEL-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 64
+; GISEL-NEXT: .cfi_offset w30, -16
+; GISEL-NEXT: .cfi_offset b8, -24
+; GISEL-NEXT: .cfi_offset b9, -32
; GISEL-NEXT: fmov d8, d1
; GISEL-NEXT: fmov d9, d2
; GISEL-NEXT: bl exp10
-; GISEL-NEXT: fmov d10, d0
+; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
+; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; GISEL-NEXT: fmov d0, d8
; GISEL-NEXT: bl exp10
-; GISEL-NEXT: fmov d8, d0
+; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
+; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill
; GISEL-NEXT: fmov d0, d9
; GISEL-NEXT: bl exp10
-; GISEL-NEXT: fmov d1, d8
-; GISEL-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; GISEL-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; GISEL-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload
; GISEL-NEXT: fmov d2, d0
-; GISEL-NEXT: fmov d0, d10
-; GISEL-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; GISEL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; GISEL-NEXT: // kill: def $d2 killed $d2 killed $q2
+; GISEL-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; GISEL-NEXT: mov v3.d[1], v1.d[0]
+; GISEL-NEXT: mov d1, v3.d[1]
+; GISEL-NEXT: fmov d0, d3
+; GISEL-NEXT: add sp, sp, #64
; GISEL-NEXT: ret
%r = call <3 x double> @llvm.exp10.v3f64(<3 x double> %x)
ret <3 x double> %r
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 70ab10e716875a..517cf7c4352fd3 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -215,10 +215,16 @@ define <3 x i8> @load_v3i8(ptr %ptr){
;
; CHECK-GI-LABEL: load_v3i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w1, [x0, #1]
-; CHECK-GI-NEXT: ldrb w2, [x0, #2]
-; CHECK-GI-NEXT: mov w0, w8
+; CHECK-GI-NEXT: ldr b0, [x0]
+; CHECK-GI-NEXT: ldr b1, [x0, #1]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s1
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
%a = load <3 x i8>, ptr %ptr
ret <3 x i8> %a
@@ -232,20 +238,38 @@ define <7 x i8> @load_v7i8(ptr %ptr){
;
; CHECK-GI-LABEL: load_v7i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr b0, [x0]
-; CHECK-GI-NEXT: ldr b1, [x0, #1]
-; CHECK-GI-NEXT: mov v0.b[0], v0.b[0]
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: ldr b1, [x0, #2]
-; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT: ldr b1, [x0, #3]
-; CHECK-GI-NEXT: mov v0.b[3], v1.b[0]
-; CHECK-GI-NEXT: ldr b1, [x0, #4]
-; CHECK-GI-NEXT: mov v0.b[4], v1.b[0]
-; CHECK-GI-NEXT: ldr b1, [x0, #5]
-; CHECK-GI-NEXT: mov v0.b[5], v1.b[0]
-; CHECK-GI-NEXT: ldr b1, [x0, #6]
-; CHECK-GI-NEXT: mov v0.b[6], v1.b[0]
+; CHECK-GI-NEXT: ldrb w8, [x0]
+; CHECK-GI-NEXT: ldrb w9, [x0, #1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #2]
+; CHECK-GI-NEXT: mov v0.h[1], w9
+; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #3]
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #4]
+; CHECK-GI-NEXT: mov v0.h[4], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #5]
+; CHECK-GI-NEXT: mov v0.h[5], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #6]
+; CHECK-GI-NEXT: mov v0.h[6], w8
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov h2, v0.h[3]
+; CHECK-GI-NEXT: mov h3, v0.h[4]
+; CHECK-GI-NEXT: mov h4, v0.h[5]
+; CHECK-GI-NEXT: mov h5, v0.h[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov h1, v0.h[2]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.b[4], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.b[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.b[6], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%a = load <7 x i8>, ptr %ptr
@@ -261,10 +285,14 @@ define <3 x i16> @load_v3i16(ptr %ptr){
; CHECK-GI-LABEL: load_v3i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: add x8, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: ldr h1, [x0, #4]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%a = load <3 x i16>, ptr %ptr
@@ -279,19 +307,26 @@ define <7 x i16> @load_v7i16(ptr %ptr){
;
; CHECK-GI-LABEL: load_v7i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: ldr h1, [x0]
+; CHECK-GI-NEXT: ldr h0, [x0, #2]
; CHECK-GI-NEXT: add x8, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: ld1 { v1.h }[2], [x8]
; CHECK-GI-NEXT: add x8, x0, #6
-; CHECK-GI-NEXT: ld1 { v0.h }[3], [x8]
+; CHECK-GI-NEXT: ld1 { v1.h }[3], [x8]
; CHECK-GI-NEXT: add x8, x0, #8
-; CHECK-GI-NEXT: ld1 { v0.h }[4], [x8]
+; CHECK-GI-NEXT: ld1 { v1.h }[4], [x8]
; CHECK-GI-NEXT: add x8, x0, #10
-; CHECK-GI-NEXT: ld1 { v0.h }[5], [x8]
+; CHECK-GI-NEXT: ld1 { v1.h }[5], [x8]
; CHECK-GI-NEXT: add x8, x0, #12
-; CHECK-GI-NEXT: ld1 { v0.h }[6], [x8]
+; CHECK-GI-NEXT: ld1 { v1.h }[6], [x8]
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
; CHECK-GI-NEXT: ret
%a = load <7 x i16>, ptr %ptr
ret <7 x i16> %a
@@ -305,11 +340,14 @@ define <3 x i32> @load_v3i32(ptr %ptr){
;
; CHECK-GI-LABEL: load_v3i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr s0, [x0]
+; CHECK-GI-NEXT: ldr s1, [x0]
; CHECK-GI-NEXT: add x8, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.s }[1], [x8]
+; CHECK-GI-NEXT: ld1 { v1.s }[1], [x8]
; CHECK-GI-NEXT: add x8, x0, #8
-; CHECK-GI-NEXT: ld1 { v0.s }[2], [x8]
+; CHECK-GI-NEXT: ld1 { v1.s }[2], [x8]
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
%a = load <3 x i32>, ptr %ptr
ret <3 x i32> %a
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 9ca975d9e742e1..9735354402aabf 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -355,10 +355,24 @@ entry:
}
define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%s = mul <3 x i32> %d, %e
ret <3 x i32> %s
@@ -457,14 +471,15 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-GI-NEXT: mov x11, v3.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
+; CHECK-GI-NEXT: fmov x10, d5
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: fmov x8, d2
+; CHECK-GI-NEXT: mul x8, x8, x10
; CHECK-GI-NEXT: mov v0.d[1], x9
-; CHECK-GI-NEXT: fmov x9, d5
-; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
entry:
%s = mul <3 x i64> %d, %e
diff --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll
index d807635f5d87d1..ad83cc81720728 100644
--- a/llvm/test/CodeGen/AArch64/rem.ll
+++ b/llvm/test/CodeGen/AArch64/rem.ll
@@ -227,10 +227,18 @@ define <3 x i8> @sv3i8(<3 x i8> %d, <3 x i8> %e) {
; CHECK-GI-NEXT: sxtb w15, w5
; CHECK-GI-NEXT: sdiv w10, w8, w9
; CHECK-GI-NEXT: sdiv w13, w11, w12
-; CHECK-GI-NEXT: msub w0, w10, w9, w8
-; CHECK-GI-NEXT: sdiv w16, w14, w15
-; CHECK-GI-NEXT: msub w1, w13, w12, w11
-; CHECK-GI-NEXT: msub w2, w16, w15, w14
+; CHECK-GI-NEXT: msub w8, w10, w9, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: sdiv w9, w14, w15
+; CHECK-GI-NEXT: msub w8, w13, w12, w11
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: msub w8, w9, w15, w14
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s1
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
entry:
%s = srem <3 x i8> %d, %e
@@ -1141,15 +1149,23 @@ define <3 x i8> @uv3i8(<3 x i8> %d, <3 x i8> %e) {
; CHECK-GI-NEXT: and w8, w0, #0xff
; CHECK-GI-NEXT: and w9, w3, #0xff
; CHECK-GI-NEXT: and w11, w1, #0xff
+; CHECK-GI-NEXT: udiv w10, w8, w9
; CHECK-GI-NEXT: and w12, w4, #0xff
; CHECK-GI-NEXT: and w14, w2, #0xff
; CHECK-GI-NEXT: and w15, w5, #0xff
-; CHECK-GI-NEXT: udiv w10, w8, w9
; CHECK-GI-NEXT: udiv w13, w11, w12
-; CHECK-GI-NEXT: msub w0, w10, w9, w8
-; CHECK-GI-NEXT: udiv w16, w14, w15
-; CHECK-GI-NEXT: msub w1, w13, w12, w11
-; CHECK-GI-NEXT: msub w2, w16, w15, w14
+; CHECK-GI-NEXT: msub w8, w10, w9, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: udiv w9, w14, w15
+; CHECK-GI-NEXT: msub w8, w13, w12, w11
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: msub w8, w9, w15, w14
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s1
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
entry:
%s = urem <3 x i8> %d, %e
@@ -2075,12 +2091,16 @@ define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) {
; CHECK-GI-NEXT: sdiv w10, w8, w9
; CHECK-GI-NEXT: sdiv w13, w11, w12
; CHECK-GI-NEXT: msub w8, w10, w9, w8
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: sdiv w16, w14, w15
-; CHECK-GI-NEXT: msub w9, w13, w12, w11
-; CHECK-GI-NEXT: mov v0.h[1], w9
-; CHECK-GI-NEXT: msub w8, w16, w15, w14
-; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: sdiv w9, w14, w15
+; CHECK-GI-NEXT: msub w8, w13, w12, w11
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: msub w8, w9, w15, w14
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -2543,12 +2563,16 @@ define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) {
; CHECK-GI-NEXT: udiv w10, w8, w9
; CHECK-GI-NEXT: udiv w13, w11, w12
; CHECK-GI-NEXT: msub w8, w10, w9, w8
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: udiv w16, w14, w15
-; CHECK-GI-NEXT: msub w9, w13, w12, w11
-; CHECK-GI-NEXT: mov v0.h[1], w9
-; CHECK-GI-NEXT: msub w8, w16, w15, w14
-; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: udiv w9, w14, w15
+; CHECK-GI-NEXT: msub w8, w13, w12, w11
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: msub w8, w9, w15, w14
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -3003,12 +3027,15 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) {
; CHECK-GI-NEXT: fmov w15, s1
; CHECK-GI-NEXT: sdiv w13, w11, w12
; CHECK-GI-NEXT: msub w8, w10, w9, w8
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: sdiv w9, w14, w15
; CHECK-GI-NEXT: msub w8, w13, w12, w11
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: msub w8, w9, w15, w14
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%s = srem <3 x i32> %d, %e
@@ -3234,12 +3261,15 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) {
; CHECK-GI-NEXT: fmov w15, s1
; CHECK-GI-NEXT: udiv w13, w11, w12
; CHECK-GI-NEXT: msub w8, w10, w9, w8
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: udiv w9, w14, w15
; CHECK-GI-NEXT: msub w8, w13, w12, w11
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: msub w8, w9, w15, w14
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%s = urem <3 x i32> %d, %e
@@ -3469,25 +3499,26 @@ define <3 x i64> @sv3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: sdiv x8, x8, x9
; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: fmov x11, d3
+; CHECK-GI-NEXT: fmov x12, d3
; CHECK-GI-NEXT: mov x14, v3.d[1]
; CHECK-GI-NEXT: sdiv x9, x9, x10
; CHECK-GI-NEXT: mov v6.d[0], x8
; CHECK-GI-NEXT: fmov x8, d2
+; CHECK-GI-NEXT: fmov x10, d5
; CHECK-GI-NEXT: mov v6.d[1], x9
-; CHECK-GI-NEXT: fmov x9, d5
-; CHECK-GI-NEXT: sdiv x12, x8, x9
-; CHECK-GI-NEXT: fmov x10, d6
+; CHECK-GI-NEXT: sdiv x9, x8, x10
+; CHECK-GI-NEXT: fmov x11, d6
; CHECK-GI-NEXT: mov x13, v6.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
-; CHECK-GI-NEXT: mul x11, x13, x14
-; CHECK-GI-NEXT: mov v2.d[0], x10
-; CHECK-GI-NEXT: mov v2.d[1], x11
-; CHECK-GI-NEXT: msub x8, x12, x9, x8
+; CHECK-GI-NEXT: mul x11, x11, x12
+; CHECK-GI-NEXT: mul x12, x13, x14
+; CHECK-GI-NEXT: mov v2.d[0], x11
+; CHECK-GI-NEXT: mov v2.d[1], x12
+; CHECK-GI-NEXT: msub x8, x9, x10, x8
; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: fmov d2, x8
; CHECK-GI-NEXT: ret
entry:
%s = srem <3 x i64> %d, %e
@@ -3634,25 +3665,26 @@ define <3 x i64> @uv3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: udiv x8, x8, x9
; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: fmov x11, d3
+; CHECK-GI-NEXT: fmov x12, d3
; CHECK-GI-NEXT: mov x14, v3.d[1]
; CHECK-GI-NEXT: udiv x9, x9, x10
; CHECK-GI-NEXT: mov v6.d[0], x8
; CHECK-GI-NEXT: fmov x8, d2
+; CHECK-GI-NEXT: fmov x10, d5
; CHECK-GI-NEXT: mov v6.d[1], x9
-; CHECK-GI-NEXT: fmov x9, d5
-; CHECK-GI-NEXT: udiv x12, x8, x9
-; CHECK-GI-NEXT: fmov x10, d6
+; CHECK-GI-NEXT: udiv x9, x8, x10
+; CHECK-GI-NEXT: fmov x11, d6
; CHECK-GI-NEXT: mov x13, v6.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
-; CHECK-GI-NEXT: mul x11, x13, x14
-; CHECK-GI-NEXT: mov v2.d[0], x10
-; CHECK-GI-NEXT: mov v2.d[1], x11
-; CHECK-GI-NEXT: msub x8, x12, x9, x8
+; CHECK-GI-NEXT: mul x11, x11, x12
+; CHECK-GI-NEXT: mul x12, x13, x14
+; CHECK-GI-NEXT: mov v2.d[0], x11
+; CHECK-GI-NEXT: mov v2.d[1], x12
+; CHECK-GI-NEXT: msub x8, x9, x10, x8
; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: fmov d2, x8
; CHECK-GI-NEXT: ret
entry:
%s = urem <3 x i64> %d, %e
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 853ed92c91fbcd..ca38f3b701084d 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -219,18 +219,16 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl w8, w0, #8
-; CHECK-GI-NEXT: lsl w9, w1, #8
-; CHECK-GI-NEXT: lsl w10, w2, #8
-; CHECK-GI-NEXT: sxth w8, w8
-; CHECK-GI-NEXT: sxth w9, w9
-; CHECK-GI-NEXT: asr w8, w8, #8
-; CHECK-GI-NEXT: asr w9, w9, #8
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: sxth w8, w10
-; CHECK-GI-NEXT: asr w8, w8, #8
-; CHECK-GI-NEXT: mov v0.h[1], w9
-; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: sxtb w8, w0
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: sxtb w8, w1
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: sxtb w8, w2
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -252,11 +250,14 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
; CHECK-GI-LABEL: sext_v3i8_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxtb w8, w0
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: sxtb w8, w1
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: sxtb w8, w2
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i8> %a to <3 x i32>
@@ -284,14 +285,17 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
; CHECK-GI-LABEL: sext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: sxtb x8, w0
; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: sxtb x8, w0
-; CHECK-GI-NEXT: sxtb x9, w1
-; CHECK-GI-NEXT: sxtb x10, w2
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
-; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: sxtb x8, w1
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: sxtb x8, w2
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i8> %a to <3 x i64>
@@ -313,7 +317,9 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) {
; CHECK-GI-NEXT: smov w8, v0.h[2]
; CHECK-GI-NEXT: mov v1.s[1], w9
; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i16> %a to <3 x i32>
@@ -337,10 +343,13 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: smov x8, v0.h[0]
; CHECK-GI-NEXT: smov x9, v0.h[1]
-; CHECK-GI-NEXT: smov x10, v0.h[2]
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
-; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mov v3.d[0], x8
+; CHECK-GI-NEXT: smov x8, v0.h[2]
+; CHECK-GI-NEXT: mov v3.d[1], x9
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i16> %a to <3 x i64>
@@ -362,10 +371,13 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: smov x8, v0.s[0]
; CHECK-GI-NEXT: smov x9, v0.s[1]
-; CHECK-GI-NEXT: smov x10, v0.s[2]
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
-; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mov v3.d[0], x8
+; CHECK-GI-NEXT: smov x8, v0.s[2]
+; CHECK-GI-NEXT: mov v3.d[1], x9
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i32> %a to <3 x i64>
@@ -384,18 +396,16 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl w8, w0, #6
-; CHECK-GI-NEXT: lsl w9, w1, #6
-; CHECK-GI-NEXT: lsl w10, w2, #6
-; CHECK-GI-NEXT: sxth w8, w8
-; CHECK-GI-NEXT: sxth w9, w9
-; CHECK-GI-NEXT: asr w8, w8, #6
-; CHECK-GI-NEXT: asr w9, w9, #6
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: sxth w8, w10
-; CHECK-GI-NEXT: asr w8, w8, #6
-; CHECK-GI-NEXT: mov v0.h[1], w9
-; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: sbfx w8, w0, #0, #10
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: sbfx w8, w1, #0, #10
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: sbfx w8, w2, #0, #10
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -417,11 +427,14 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
; CHECK-GI-LABEL: sext_v3i10_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sbfx w8, w0, #0, #10
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: sbfx w8, w1, #0, #10
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: sbfx w8, w2, #0, #10
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i10> %a to <3 x i32>
@@ -449,14 +462,17 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
; CHECK-GI-LABEL: sext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: sbfx x8, x0, #0, #10
; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: sbfx x8, x0, #0, #10
-; CHECK-GI-NEXT: sbfx x9, x1, #0, #10
-; CHECK-GI-NEXT: sbfx x10, x2, #0, #10
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
-; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: sbfx x8, x1, #0, #10
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: sbfx x8, x2, #0, #10
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i10> %a to <3 x i64>
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 066928687cc02d..c8344a39da56a7 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -1069,46 +1069,188 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){
; CHECK-GI-NEXT: mov v0.b[2], w2
; CHECK-GI-NEXT: mov v1.b[2], w5
; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT: umov w0, v0.b[0]
-; CHECK-GI-NEXT: umov w1, v0.b[1]
-; CHECK-GI-NEXT: umov w2, v0.b[2]
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: umov w8, v0.b[2]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov s0, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v1.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w1, s0
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
%3 = shl <3 x i8> %0, %1
ret <3 x i8> %3
}
define <7 x i8> @shl_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-LABEL: shl_v7i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_v7i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ushl v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_v7i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov b3, v0.b[1]
+; CHECK-GI-NEXT: mov b4, v1.b[1]
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT: mov b6, v0.b[2]
+; CHECK-GI-NEXT: mov b7, v1.b[2]
+; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[3]
+; CHECK-GI-NEXT: mov v2.b[2], v6.b[0]
+; CHECK-GI-NEXT: mov b6, v0.b[4]
+; CHECK-GI-NEXT: mov v5.b[2], v7.b[0]
+; CHECK-GI-NEXT: mov b7, v1.b[4]
+; CHECK-GI-NEXT: mov v2.b[3], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[5]
+; CHECK-GI-NEXT: mov b0, v0.b[6]
+; CHECK-GI-NEXT: mov v5.b[3], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[5]
+; CHECK-GI-NEXT: mov b1, v1.b[6]
+; CHECK-GI-NEXT: mov v2.b[4], v6.b[0]
+; CHECK-GI-NEXT: mov v5.b[4], v7.b[0]
+; CHECK-GI-NEXT: mov v2.b[5], v3.b[0]
+; CHECK-GI-NEXT: mov v5.b[5], v4.b[0]
+; CHECK-GI-NEXT: mov v2.b[6], v0.b[0]
+; CHECK-GI-NEXT: mov v5.b[6], v1.b[0]
+; CHECK-GI-NEXT: ushl v0.8b, v2.8b, v5.8b
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov b5, v0.b[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov b1, v0.b[5]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.h[4], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.h[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.h[6], w8
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov h2, v0.h[3]
+; CHECK-GI-NEXT: mov h3, v0.h[4]
+; CHECK-GI-NEXT: mov h4, v0.h[5]
+; CHECK-GI-NEXT: mov h5, v0.h[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov h1, v0.h[2]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.b[4], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.b[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.b[6], w8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%3 = shl <7 x i8> %0, %1
ret <7 x i8> %3
}
define <3 x i16> @shl_v3i16(<3 x i16> %0, <3 x i16> %1){
-; CHECK-LABEL: shl_v3i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_v3i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ushl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_v3i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT: ushl v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%3 = shl <3 x i16> %0, %1
ret <3 x i16> %3
}
define <7 x i16> @shl_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-LABEL: shl_v7i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_v7i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_v7i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-GI-NEXT: ushl v1.8h, v2.8h, v3.8h
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: ret
%3 = shl <7 x i16> %0, %1
ret <7 x i16> %3
}
define <3 x i32> @shl_v3i32(<3 x i32> %0, <3 x i32> %1){
-; CHECK-LABEL: shl_v3i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
%3 = shl <3 x i32> %0, %1
ret <3 x i32> %3
}
@@ -1142,50 +1284,196 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){
; CHECK-GI-NEXT: mov v1.b[2], w2
; CHECK-GI-NEXT: neg v0.8b, v0.8b
; CHECK-GI-NEXT: sshl v0.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT: umov w0, v0.b[0]
-; CHECK-GI-NEXT: umov w1, v0.b[1]
-; CHECK-GI-NEXT: umov w2, v0.b[2]
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: umov w8, v0.b[2]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov s0, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v1.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w1, s0
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
%3 = ashr <3 x i8> %0, %1
ret <3 x i8> %3
}
define <7 x i8> @ashr_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-LABEL: ashr_v7i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.8b, v1.8b
-; CHECK-NEXT: sshl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ashr_v7i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v1.8b, v1.8b
+; CHECK-SD-NEXT: sshl v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ashr_v7i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov b2, v1.b[1]
+; CHECK-GI-NEXT: mov v3.b[0], v1.b[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov b4, v0.b[1]
+; CHECK-GI-NEXT: mov v5.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b6, v1.b[2]
+; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[3]
+; CHECK-GI-NEXT: mov v3.b[2], v6.b[0]
+; CHECK-GI-NEXT: mov b6, v0.b[3]
+; CHECK-GI-NEXT: mov v5.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[4]
+; CHECK-GI-NEXT: mov v3.b[3], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov v5.b[3], v6.b[0]
+; CHECK-GI-NEXT: mov b6, v1.b[5]
+; CHECK-GI-NEXT: mov b1, v1.b[6]
+; CHECK-GI-NEXT: mov v3.b[4], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[5]
+; CHECK-GI-NEXT: mov b0, v0.b[6]
+; CHECK-GI-NEXT: mov v5.b[4], v4.b[0]
+; CHECK-GI-NEXT: mov v3.b[5], v6.b[0]
+; CHECK-GI-NEXT: mov v5.b[5], v2.b[0]
+; CHECK-GI-NEXT: mov v3.b[6], v1.b[0]
+; CHECK-GI-NEXT: mov v5.b[6], v0.b[0]
+; CHECK-GI-NEXT: neg v0.8b, v3.8b
+; CHECK-GI-NEXT: sshl v0.8b, v5.8b, v0.8b
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov b5, v0.b[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov b1, v0.b[5]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.h[4], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.h[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.h[6], w8
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov h2, v0.h[3]
+; CHECK-GI-NEXT: mov h3, v0.h[4]
+; CHECK-GI-NEXT: mov h4, v0.h[5]
+; CHECK-GI-NEXT: mov h5, v0.h[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov h1, v0.h[2]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.b[4], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.b[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.b[6], w8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%3 = ashr <7 x i8> %0, %1
ret <7 x i8> %3
}
define <3 x i16> @ashr_v3i16(<3 x i16> %0, <3 x i16> %1){
-; CHECK-LABEL: ashr_v3i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.4h, v1.4h
-; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ashr_v3i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v1.4h, v1.4h
+; CHECK-SD-NEXT: sshl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ashr_v3i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT: neg v0.4h, v2.4h
+; CHECK-GI-NEXT: sshl v1.4h, v3.4h, v0.4h
+; CHECK-GI-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%3 = ashr <3 x i16> %0, %1
ret <3 x i16> %3
}
define <7 x i16> @ashr_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-LABEL: ashr_v7i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.8h, v1.8h
-; CHECK-NEXT: sshl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ashr_v7i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v1.8h, v1.8h
+; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ashr_v7i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v2.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v3.h[4], v0.h[4]
+; CHECK-GI-NEXT: mov v2.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v3.h[5], v0.h[5]
+; CHECK-GI-NEXT: mov v2.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v3.h[6], v0.h[6]
+; CHECK-GI-NEXT: neg v0.8h, v2.8h
+; CHECK-GI-NEXT: sshl v1.8h, v3.8h, v0.8h
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: ret
%3 = ashr <7 x i16> %0, %1
ret <7 x i16> %3
}
define <3 x i32> @ashr_v3i32(<3 x i32> %0, <3 x i32> %1){
-; CHECK-LABEL: ashr_v3i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.4s, v1.4s
-; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ashr_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v1.4s, v1.4s
+; CHECK-SD-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ashr_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v2.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT: neg v0.4s, v2.4s
+; CHECK-GI-NEXT: sshl v1.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
%3 = ashr <3 x i32> %0, %1
ret <3 x i32> %3
}
@@ -1218,50 +1506,196 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){
; CHECK-GI-NEXT: mov v1.b[2], w2
; CHECK-GI-NEXT: neg v0.8b, v0.8b
; CHECK-GI-NEXT: ushl v0.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT: umov w0, v0.b[0]
-; CHECK-GI-NEXT: umov w1, v0.b[1]
-; CHECK-GI-NEXT: umov w2, v0.b[2]
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: umov w8, v0.b[2]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov s0, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v1.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w1, s0
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
%3 = lshr <3 x i8> %0, %1
ret <3 x i8> %3
}
define <7 x i8> @lshr_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-LABEL: lshr_v7i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.8b, v1.8b
-; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: lshr_v7i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v1.8b, v1.8b
+; CHECK-SD-NEXT: ushl v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: lshr_v7i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov b2, v1.b[1]
+; CHECK-GI-NEXT: mov v3.b[0], v1.b[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov b4, v0.b[1]
+; CHECK-GI-NEXT: mov v5.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b6, v1.b[2]
+; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[3]
+; CHECK-GI-NEXT: mov v3.b[2], v6.b[0]
+; CHECK-GI-NEXT: mov b6, v0.b[3]
+; CHECK-GI-NEXT: mov v5.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[4]
+; CHECK-GI-NEXT: mov v3.b[3], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov v5.b[3], v6.b[0]
+; CHECK-GI-NEXT: mov b6, v1.b[5]
+; CHECK-GI-NEXT: mov b1, v1.b[6]
+; CHECK-GI-NEXT: mov v3.b[4], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[5]
+; CHECK-GI-NEXT: mov b0, v0.b[6]
+; CHECK-GI-NEXT: mov v5.b[4], v4.b[0]
+; CHECK-GI-NEXT: mov v3.b[5], v6.b[0]
+; CHECK-GI-NEXT: mov v5.b[5], v2.b[0]
+; CHECK-GI-NEXT: mov v3.b[6], v1.b[0]
+; CHECK-GI-NEXT: mov v5.b[6], v0.b[0]
+; CHECK-GI-NEXT: neg v0.8b, v3.8b
+; CHECK-GI-NEXT: ushl v0.8b, v5.8b, v0.8b
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov b5, v0.b[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov b1, v0.b[5]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.h[4], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.h[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.h[6], w8
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov h2, v0.h[3]
+; CHECK-GI-NEXT: mov h3, v0.h[4]
+; CHECK-GI-NEXT: mov h4, v0.h[5]
+; CHECK-GI-NEXT: mov h5, v0.h[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov h1, v0.h[2]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.b[4], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.b[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.b[6], w8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%3 = lshr <7 x i8> %0, %1
ret <7 x i8> %3
}
define <3 x i16> @lshr_v3i16(<3 x i16> %0, <3 x i16> %1){
-; CHECK-LABEL: lshr_v3i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.4h, v1.4h
-; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: lshr_v3i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v1.4h, v1.4h
+; CHECK-SD-NEXT: ushl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: lshr_v3i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT: neg v0.4h, v2.4h
+; CHECK-GI-NEXT: ushl v1.4h, v3.4h, v0.4h
+; CHECK-GI-NEXT: umov w8, v1.h[0]
+; CHECK-GI-NEXT: umov w9, v1.h[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v1.h[2]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%3 = lshr <3 x i16> %0, %1
ret <3 x i16> %3
}
define <7 x i16> @lshr_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-LABEL: lshr_v7i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.8h, v1.8h
-; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: lshr_v7i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v1.8h, v1.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: lshr_v7i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v2.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v3.h[4], v0.h[4]
+; CHECK-GI-NEXT: mov v2.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v3.h[5], v0.h[5]
+; CHECK-GI-NEXT: mov v2.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v3.h[6], v0.h[6]
+; CHECK-GI-NEXT: neg v0.8h, v2.8h
+; CHECK-GI-NEXT: ushl v1.8h, v3.8h, v0.8h
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: ret
%3 = lshr <7 x i16> %0, %1
ret <7 x i16> %3
}
define <3 x i32> @lshr_v3i32(<3 x i32> %0, <3 x i32> %1){
-; CHECK-LABEL: lshr_v3i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.4s, v1.4s
-; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: lshr_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v1.4s, v1.4s
+; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: lshr_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v2.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v1.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v0.s[2]
+; CHECK-GI-NEXT: neg v0.4s, v2.4s
+; CHECK-GI-NEXT: ushl v1.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
%3 = lshr <3 x i32> %0, %1
ret <3 x i32> %3
}
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 6b5951551c3a54..db0fd4293e084b 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -322,10 +322,17 @@ define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b){
}
define <1 x i32> @shufflevector_v1i32(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: shufflevector_v1i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, d1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shufflevector_v1i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shufflevector_v1i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 1>
ret <1 x i32> %c
}
@@ -464,9 +471,16 @@ define <16 x i16> @shufflevector_v16i16_zeroes(<16 x i16> %a, <16 x i16> %b){
}
define <1 x i32> @shufflevector_v1i32_zeroes(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: shufflevector_v1i32_zeroes:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shufflevector_v1i32_zeroes:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shufflevector_v1i32_zeroes:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v0.s[0], v0.s[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 0>
ret <1 x i32> %c
}
@@ -503,19 +517,14 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
;
; CHECK-GI-LABEL: shufflevector_v3i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w3
-; CHECK-GI-NEXT: adrp x8, .LCPI30_0
-; CHECK-GI-NEXT: mov v0.b[1], w1
-; CHECK-GI-NEXT: mov v1.b[1], w4
-; CHECK-GI-NEXT: mov v0.b[2], w2
-; CHECK-GI-NEXT: mov v1.b[2], w5
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI30_0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
-; CHECK-GI-NEXT: umov w0, v0.b[0]
-; CHECK-GI-NEXT: umov w1, v0.b[1]
-; CHECK-GI-NEXT: umov w2, v0.b[2]
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov w2, w4
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s0, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w1, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 1, i32 2, i32 4>
ret <3 x i8> %c
@@ -535,12 +544,62 @@ define <7 x i8> @shufflevector_v7i8(<7 x i8> %a, <7 x i8> %b) {
; CHECK-GI-LABEL: shufflevector_v7i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: adrp x8, .LCPI31_0
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI31_0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mov b4, v0.b[3]
+; CHECK-GI-NEXT: mov b5, v0.b[4]
+; CHECK-GI-NEXT: mov b6, v0.b[5]
+; CHECK-GI-NEXT: mov b7, v1.b[3]
+; CHECK-GI-NEXT: mov b16, v1.b[4]
+; CHECK-GI-NEXT: mov b17, v1.b[5]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov b2, v0.b[6]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov b4, v1.b[2]
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov b3, v1.b[6]
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: fmov w9, s4
+; CHECK-GI-NEXT: mov v0.h[4], w8
+; CHECK-GI-NEXT: fmov w8, s6
+; CHECK-GI-NEXT: mov v1.h[2], w9
+; CHECK-GI-NEXT: fmov w9, s7
+; CHECK-GI-NEXT: mov v0.h[5], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v1.h[3], w9
+; CHECK-GI-NEXT: mov v0.h[6], w8
+; CHECK-GI-NEXT: fmov w8, s16
+; CHECK-GI-NEXT: mov v1.h[4], w8
+; CHECK-GI-NEXT: fmov w8, s17
+; CHECK-GI-NEXT: mov h4, v0.h[3]
+; CHECK-GI-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NEXT: mov h0, v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[5], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: fmov w9, s4
+; CHECK-GI-NEXT: mov v2.b[1], w9
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: mov v1.h[6], w8
+; CHECK-GI-NEXT: mov v2.b[2], w9
+; CHECK-GI-NEXT: mov h0, v1.h[1]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov h3, v1.h[3]
+; CHECK-GI-NEXT: mov v2.b[3], w8
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov h0, v1.h[5]
+; CHECK-GI-NEXT: mov v2.b[4], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v2.b[5], w8
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov v2.b[6], w8
+; CHECK-GI-NEXT: fmov d0, d2
; CHECK-GI-NEXT: ret
%c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
ret <7 x i8> %c
@@ -556,11 +615,18 @@ define <3 x i16> @shufflevector_v3i16(<3 x i16> %a, <3 x i16> %b) {
; CHECK-GI-LABEL: shufflevector_v3i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: adrp x8, .LCPI32_0
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI32_0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT: mov v2.s[0], w8
+; CHECK-GI-NEXT: umov w8, v0.h[2]
+; CHECK-GI-NEXT: mov v2.s[1], w9
+; CHECK-GI-NEXT: mov v2.s[2], w8
+; CHECK-GI-NEXT: mov w8, v2.s[1]
+; CHECK-GI-NEXT: mov w9, v2.s[2]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.h[1], w9
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 1, i32 2, i32 4>
@@ -579,11 +645,27 @@ define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
;
; CHECK-GI-LABEL: shufflevector_v7i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI33_0
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0]
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v2.h[4], v0.h[4]
+; CHECK-GI-NEXT: mov v3.h[4], v1.h[4]
+; CHECK-GI-NEXT: mov v2.h[5], v0.h[5]
+; CHECK-GI-NEXT: mov v3.h[5], v1.h[5]
+; CHECK-GI-NEXT: mov v2.h[6], v0.h[6]
+; CHECK-GI-NEXT: mov v3.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.h[0], v2.h[1]
+; CHECK-GI-NEXT: mov v0.h[1], v2.h[3]
+; CHECK-GI-NEXT: mov v0.h[2], v2.h[5]
+; CHECK-GI-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v0.h[4], v3.h[1]
+; CHECK-GI-NEXT: mov v0.h[5], v3.h[3]
+; CHECK-GI-NEXT: mov v0.h[6], v3.h[5]
; CHECK-GI-NEXT: ret
%c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
ret <7 x i16> %c
@@ -598,11 +680,12 @@ define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) {
;
; CHECK-GI-LABEL: shufflevector_v3i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI34_0
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0]
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v0.s[0], v2.s[1]
+; CHECK-GI-NEXT: mov v0.s[1], v2.s[2]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[1]
; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 1, i32 2, i32 4>
ret <3 x i32> %c
@@ -619,52 +702,130 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
;
; CHECK-GI-LABEL: shufflevector_v3i8_zeroes:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: mov v0.b[1], w1
-; CHECK-GI-NEXT: mov v0.b[2], w2
-; CHECK-GI-NEXT: dup v0.8b, v0.b[0]
-; CHECK-GI-NEXT: umov w0, v0.b[0]
-; CHECK-GI-NEXT: umov w1, v0.b[1]
-; CHECK-GI-NEXT: umov w2, v0.b[2]
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s0
+; CHECK-GI-NEXT: fmov w2, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 0, i32 0, i32 0>
ret <3 x i8> %c
}
define <7 x i8> @shufflevector_v7i8_zeroes(<7 x i8> %a, <7 x i8> %b) {
-; CHECK-LABEL: shufflevector_v7i8_zeroes:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.8b, v0.b[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shufflevector_v7i8_zeroes:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: dup v0.8b, v0.b[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shufflevector_v7i8_zeroes:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov b5, v0.b[6]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov b1, v0.b[5]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.h[4], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.h[5], w8
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: mov v0.h[6], w8
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: fmov w10, s0
+; CHECK-GI-NEXT: fmov w11, s0
+; CHECK-GI-NEXT: fmov w12, s0
+; CHECK-GI-NEXT: fmov w13, s0
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: mov v0.b[2], w9
+; CHECK-GI-NEXT: mov v0.b[3], w10
+; CHECK-GI-NEXT: mov v0.b[4], w11
+; CHECK-GI-NEXT: mov v0.b[5], w12
+; CHECK-GI-NEXT: mov v0.b[6], w13
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <7 x i8> %c
}
define <3 x i16> @shufflevector_v3i16_zeroes(<3 x i16> %a, <3 x i16> %b) {
-; CHECK-LABEL: shufflevector_v3i16_zeroes:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.4h, v0.h[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shufflevector_v3i16_zeroes:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shufflevector_v3i16_zeroes:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: umov w8, v0.h[2]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 0, i32 0, i32 0>
ret <3 x i16> %c
}
define <7 x i16> @shufflevector_v7i16_zeroes(<7 x i16> %a, <7 x i16> %b) {
-; CHECK-LABEL: shufflevector_v7i16_zeroes:
-; CHECK: // %bb.0:
-; CHECK-NEXT: dup v0.8h, v0.h[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shufflevector_v7i16_zeroes:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v0.8h, v0.h[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shufflevector_v7i16_zeroes:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v1.h[4], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[5], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[6], v0.h[6]
+; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NEXT: ret
%c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <7 x i16> %c
}
define <3 x i32> @shufflevector_v3i32_zeroes(<3 x i32> %a, <3 x i32> %b) {
-; CHECK-LABEL: shufflevector_v3i32_zeroes:
-; CHECK: // %bb.0:
-; CHECK-NEXT: dup v0.4s, v0.s[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shufflevector_v3i32_zeroes:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v0.4s, v0.s[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shufflevector_v3i32_zeroes:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 0, i32 0, i32 0>
ret <3 x i32> %c
}
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 8f35a69f52b85b..8cd1bcfb82dcc3 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -343,10 +343,24 @@ entry:
}
define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: v3i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v3i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v3i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
+; CHECK-GI-NEXT: sub v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
+; CHECK-GI-NEXT: ret
entry:
%s = sub <3 x i32> %d, %e
ret <3 x i32> %s
@@ -408,8 +422,9 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT: sub x8, x8, x9
-; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: sub v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index d71aed2d17506b..69fd0ad01b7c5b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -187,12 +187,22 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
}
define i32 @test_v3i32(<3 x i32> %a) nounwind {
-; CHECK-LABEL: test_v3i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov v0.s[3], wzr
-; CHECK-NEXT: umaxv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov v0.s[3], wzr
+; CHECK-SD-NEXT: umaxv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[3], wzr
+; CHECK-GI-NEXT: umaxv s0, v1.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%b = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a)
ret i32 %b
}
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 8a4d6b8c7b789f..96474a84ca9924 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -293,10 +293,19 @@ entry:
}
define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
-; CHECK-LABEL: xtn_v3i32_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: xtn_v3i32_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: xtn_v3i32_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%arg1 = trunc <3 x i32> %a to <3 x i16>
ret <3 x i16> %arg1
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 0d5010113ce0b2..2e979bb1225601 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -243,11 +243,15 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
; CHECK-GI-LABEL: zext_v3i8_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: and w8, w0, #0xff
-; CHECK-GI-NEXT: and w9, w1, #0xff
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: and w8, w1, #0xff
+; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: and w8, w2, #0xff
-; CHECK-GI-NEXT: mov v0.h[1], w9
-; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -269,11 +273,14 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
; CHECK-GI-LABEL: zext_v3i8_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: and w8, w0, #0xff
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: and w8, w1, #0xff
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: and w8, w2, #0xff
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i8> %a to <3 x i32>
@@ -301,14 +308,17 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
; CHECK-GI-LABEL: zext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: and x8, x0, #0xff
; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: and x8, x0, #0xff
-; CHECK-GI-NEXT: and x9, x1, #0xff
-; CHECK-GI-NEXT: and x10, x2, #0xff
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
-; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: and x8, x1, #0xff
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: and x8, x2, #0xff
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i8> %a to <3 x i64>
@@ -330,7 +340,9 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) {
; CHECK-GI-NEXT: umov w8, v0.h[2]
; CHECK-GI-NEXT: mov v1.s[1], w9
; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i16> %a to <3 x i32>
@@ -354,10 +366,13 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[1]
-; CHECK-GI-NEXT: umov w10, v0.h[2]
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
-; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mov v3.d[0], x8
+; CHECK-GI-NEXT: umov w8, v0.h[2]
+; CHECK-GI-NEXT: mov v3.d[1], x9
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i16> %a to <3 x i64>
@@ -379,10 +394,13 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, v0.s[0]
; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: mov w10, v0.s[2]
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
-; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mov v3.d[0], x8
+; CHECK-GI-NEXT: mov w8, v0.s[2]
+; CHECK-GI-NEXT: mov v3.d[1], x9
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v3.d[1]
+; CHECK-GI-NEXT: fmov d0, d3
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i32> %a to <3 x i64>
@@ -402,11 +420,15 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
; CHECK-GI-LABEL: zext_v3i10_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: and w8, w0, #0x3ff
-; CHECK-GI-NEXT: and w9, w1, #0x3ff
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: and w8, w1, #0x3ff
+; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: and w8, w2, #0x3ff
-; CHECK-GI-NEXT: mov v0.h[1], w9
-; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -428,11 +450,14 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
; CHECK-GI-LABEL: zext_v3i10_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: and w8, w0, #0x3ff
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: and w8, w1, #0x3ff
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: and w8, w2, #0x3ff
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[2]
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i10> %a to <3 x i32>
@@ -459,14 +484,17 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
; CHECK-GI-LABEL: zext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: and x8, x0, #0x3ff
; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: and x8, x0, #0x3ff
-; CHECK-GI-NEXT: and x9, x1, #0x3ff
-; CHECK-GI-NEXT: and x10, x2, #0x3ff
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
-; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: and x8, x1, #0x3ff
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: and x8, x2, #0x3ff
+; CHECK-GI-NEXT: mov v2.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i10> %a to <3 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index c8b82716a9fe13..74f259d7cd4cca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -9,8 +9,13 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
; GFX7-LABEL: v_add_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16:
@@ -45,8 +50,13 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_fneg_lhs:
@@ -84,8 +94,13 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_fneg_rhs:
@@ -130,6 +145,11 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
@@ -165,8 +185,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
@@ -197,8 +222,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
@@ -230,8 +260,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
@@ -614,6 +649,11 @@ define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_inline_imm_neg1_0:
@@ -645,6 +685,11 @@ define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_inline_imm_1_0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..aba7ded8fe17f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -753,6 +753,11 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ashr_v2i16:
@@ -782,10 +787,15 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
; GFX6-LABEL: v_ashr_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ashr_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index 132dc876b3b054..b026fdb755c00f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -566,6 +566,11 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1
; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_v2i16:
@@ -609,6 +614,10 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
; GFX8-LABEL: v_bswap_v3i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_mov_b32 s4, 0x2030001
; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
index 42e53bedb8d857..26e8fe2c9a27c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
@@ -838,11 +838,18 @@ body: |
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32)
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32)
+ ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX9-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32)
+ ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; GFX9-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
@@ -864,11 +871,18 @@ body: |
; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32)
+ ; GFX9-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32)
+ ; GFX9-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX9-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32)
+ ; GFX9-CONTRACT-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; GFX9-CONTRACT-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32)
+ ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+ ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
@@ -891,11 +905,18 @@ body: |
; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32)
+ ; GFX9-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32)
+ ; GFX9-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX9-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32)
+ ; GFX9-DENORM-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; GFX9-DENORM-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
@@ -917,11 +938,18 @@ body: |
; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32)
+ ; GFX9-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32)
+ ; GFX9-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX9-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32)
+ ; GFX9-UNSAFE-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; GFX9-UNSAFE-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32)
+ ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+ ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: test_4xfloat_add_mul
@@ -944,11 +972,18 @@ body: |
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32)
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32)
+ ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; GFX10-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
@@ -970,11 +1005,18 @@ body: |
; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32)
+ ; GFX10-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32)
+ ; GFX10-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32)
+ ; GFX10-CONTRACT-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; GFX10-CONTRACT-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32)
+ ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+ ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
@@ -997,11 +1039,18 @@ body: |
; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32)
+ ; GFX10-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32)
+ ; GFX10-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32)
+ ; GFX10-DENORM-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; GFX10-DENORM-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
@@ -1023,11 +1072,18 @@ body: |
; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32)
+ ; GFX10-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32)
+ ; GFX10-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32)
+ ; GFX10-UNSAFE-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; GFX10-UNSAFE-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32)
+ ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
+ ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[EVEC3]](s32)
; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
@@ -1077,10 +1133,15 @@ body: |
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32)
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32)
+ ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX9-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1099,10 +1160,15 @@ body: |
; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32)
+ ; GFX9-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32)
+ ; GFX9-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX9-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32)
+ ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1122,10 +1188,15 @@ body: |
; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32)
+ ; GFX9-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32)
+ ; GFX9-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX9-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1144,10 +1215,15 @@ body: |
; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32)
+ ; GFX9-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32)
+ ; GFX9-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX9-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32)
+ ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1167,10 +1243,15 @@ body: |
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32)
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32)
+ ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1189,10 +1270,15 @@ body: |
; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32)
+ ; GFX10-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32)
+ ; GFX10-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32)
+ ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1212,10 +1298,15 @@ body: |
; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32)
+ ; GFX10-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32)
+ ; GFX10-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
@@ -1234,10 +1325,15 @@ body: |
; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32)
+ ; GFX10-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX10-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32)
+ ; GFX10-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; GFX10-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32)
+ ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32)
+ ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32)
; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
index 2845a632a84b36..5777ecfce459fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -15,8 +15,9 @@ body: |
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
@@ -45,8 +46,9 @@ body: |
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
@@ -77,8 +79,9 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
@@ -114,8 +117,9 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
@@ -147,8 +151,9 @@ body: |
; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1
; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
@@ -179,8 +184,9 @@ body: |
; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1
; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
@@ -213,8 +219,9 @@ body: |
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5
@@ -258,8 +265,9 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5
@@ -304,8 +312,9 @@ body: |
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
@@ -347,8 +356,9 @@ body: |
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
@@ -399,8 +409,9 @@ body: |
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %el1
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FNEG]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
@@ -430,8 +441,9 @@ body: |
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64)
+ ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[COPY1]], %el1
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
index 9eeb633f0a817c..e91251186a18d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
@@ -68,8 +68,20 @@ define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY]]
; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY1]]
- ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32)
- ; CHECK-NEXT: $vgpr1 = COPY [[ADD1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
+ ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>)
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: $vgpr1 = COPY [[LSHR1]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%add = add <2 x i16> %arg0, %arg0
ret <2 x i16> %add
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 5ba036c386a402..d723ccccda6953 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -774,20 +774,23 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
@@ -826,6 +829,9 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_fdiv_v2f16:
@@ -1076,16 +1082,19 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX6-LABEL: v_fdiv_v2f16_afn:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn:
@@ -1152,20 +1161,23 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
@@ -1204,6 +1216,9 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_fdiv_v2f16_ulp25:
@@ -1467,20 +1482,23 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
@@ -1519,6 +1537,9 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_rcp_v2f16:
@@ -1770,20 +1791,23 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
@@ -1822,6 +1846,9 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_neg_rcp_v2f16:
@@ -2067,6 +2094,7 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
@@ -2076,22 +2104,24 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_fabs:
@@ -2117,24 +2147,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_rcp_v2f16_fabs:
@@ -2389,6 +2422,7 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
@@ -2398,22 +2432,24 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
@@ -2439,24 +2475,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
@@ -2717,20 +2756,23 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
@@ -2769,6 +2811,9 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_arcp:
@@ -2812,15 +2857,18 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
+; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
@@ -2877,20 +2925,23 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
@@ -2929,6 +2980,9 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_rcp_v2f16_ulp25:
@@ -3167,16 +3221,19 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
@@ -3243,20 +3300,23 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3295,6 +3355,9 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3347,16 +3410,19 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
@@ -5395,8 +5461,11 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rsq_v2f16:
@@ -5441,6 +5510,9 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_rsq_v2f16:
@@ -5709,8 +5781,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16:
@@ -5755,6 +5830,9 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-IEEE-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index 99e6c5d06a0e19..f3237a2612616f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -237,16 +237,19 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
; GFX6-LABEL: v_fma_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16:
@@ -291,16 +294,19 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_lhs:
@@ -347,16 +353,19 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_rhs:
@@ -398,16 +407,19 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs:
@@ -511,22 +523,28 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX6-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v9
+; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v11
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6
-; GFX6-NEXT: v_fma_f32 v1, v1, v5, v9
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v10
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v11
-; GFX6-NEXT: v_fma_f32 v2, v2, v4, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v10
+; GFX6-NEXT: v_fma_f32 v1, v1, v4, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_fma_f32 v3, v3, v6, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_fma_f32 v3, v3, v7, v9
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_fma_f32 v2, v2, v6, v8
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 543f8e413abd86..882eacafef1956 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -144,8 +144,12 @@ define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
; GFX8-LABEL: v_fmul_v3f16_fneg_lhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000
+; GFX8-NEXT: v_xor_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
@@ -174,8 +178,12 @@ define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
; GFX8-LABEL: v_fmul_v3f16_fneg_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000
+; GFX8-NEXT: v_xor_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 0577117e9d9e1d..228d30a040aadf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -376,31 +376,34 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX6-LABEL: v_pow_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, 0xc2fc0000
+; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_log_f32_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xc2fc0000
; GFX6-NEXT: v_mov_b32_e32 v5, 0x42800000
-; GFX6-NEXT: v_log_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT: v_add_f32_e32 v1, v1, v3
; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc
-; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT: v_add_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_exp_f32_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0x1f800000
+; GFX6-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[4:5]
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16:
@@ -506,21 +509,24 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
; GFX6-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2
+; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v2
; GFX6-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
-; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
-; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX6-NEXT: v_exp_f32_e32 v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v6
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX6-NEXT: v_exp_f32_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v5, s[4:5]
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
@@ -620,9 +626,9 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -632,21 +638,24 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_mov_b32_e32 v3, 0xc2fc0000
; GFX6-NEXT: v_mov_b32_e32 v4, 0x42800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v3
; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
-; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
-; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX6-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
; GFX6-NEXT: v_exp_f32_e32 v0, v0
+; GFX6-NEXT: v_add_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v5, s[4:5]
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
@@ -748,11 +757,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX6-NEXT: v_log_f32_e32 v3, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -762,21 +771,24 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_mov_b32_e32 v3, 0xc2fc0000
; GFX6-NEXT: v_mov_b32_e32 v4, 0x42800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v3
; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
-; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
-; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, v4, s[4:5]
; GFX6-NEXT: v_exp_f32_e32 v2, v2
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX6-NEXT: v_exp_f32_e32 v1, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v6
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT: v_exp_f32_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc
+; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v5, s[4:5]
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 3bd3486ec261d4..3dc014a3588dd2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3983,6 +3983,11 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i16:
@@ -4063,6 +4068,11 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i16_4_8:
@@ -5037,7 +5047,17 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v1, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 58304d2072d7f6..b12ad74462e7ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3763,6 +3763,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v2i16:
@@ -3852,6 +3857,11 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v2i16_4_8:
@@ -4341,6 +4351,10 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshr_v3i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s8, s4, 16
+; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT: s_lshl_b32 s8, s8, 16
+; GFX8-NEXT: s_or_b32 s4, s4, s8
; GFX8-NEXT: s_and_b32 s8, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
@@ -4373,6 +4387,7 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-NEXT: s_lshr_b32 s4, s6, s4
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s4, 0xffff, s3
+; GFX8-NEXT: s_and_b32 s5, s5, 0xffff
; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_lshr_b32 s4, s4, 15
; GFX8-NEXT: s_or_b32 s1, s1, s4
@@ -4593,6 +4608,9 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
; GFX8-LABEL: v_fshr_v3i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2
; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
@@ -4623,7 +4641,7 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v3
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v3
-; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5
+; GFX8-NEXT: v_xor_b32_sdwa v3, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v4, 15, v3
; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3
; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
@@ -5013,36 +5031,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6
-; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v7
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT: v_and_b32_e32 v8, 15, v6
-; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
-; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 15, v7
-; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7
-; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3
-; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v2
+; GFX6-NEXT: v_bfe_u32 v2, v6, 1, 15
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v3
+; GFX6-NEXT: v_bfe_u32 v3, v7, 1, 15
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3
+; GFX6-NEXT: v_xor_b32_e32 v5, -1, v9
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v7
+; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v7, 15, v5
+; GFX6-NEXT: v_xor_b32_e32 v5, -1, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 15, v6
+; GFX6-NEXT: v_xor_b32_e32 v5, -1, v6
+; GFX6-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v1, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
index 8e4e4cf2c5b87f..cd02df5882ca1b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -27,10 +27,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_1d
@@ -55,10 +59,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_1d
@@ -83,10 +91,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -120,10 +132,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_2d
@@ -151,10 +167,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_2d
@@ -182,10 +202,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -227,10 +251,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_3d
@@ -265,10 +293,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_3d
@@ -302,10 +334,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -348,10 +384,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_cube
@@ -386,10 +426,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_cube
@@ -423,10 +467,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -462,10 +510,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_1darray
@@ -493,10 +545,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_1darray
@@ -524,10 +580,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -569,10 +629,14 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_2darray
@@ -607,10 +671,14 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_2darray
@@ -644,10 +712,14 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -690,10 +762,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_2dmsaa
@@ -728,10 +804,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_2dmsaa
@@ -765,10 +845,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -813,10 +897,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_2darraymsaa
@@ -853,10 +941,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_2darraymsaa
@@ -892,10 +984,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -932,10 +1028,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_mip_1d
@@ -963,10 +1063,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_mip_1d
@@ -994,10 +1098,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -1039,10 +1147,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_mip_2d
@@ -1077,10 +1189,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_mip_2d
@@ -1114,10 +1230,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1162,10 +1282,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_mip_3d
@@ -1202,10 +1326,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_mip_3d
@@ -1241,10 +1369,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1290,10 +1422,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_mip_cube
@@ -1330,10 +1466,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_mip_cube
@@ -1369,10 +1509,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1416,10 +1560,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_mip_1darray
@@ -1454,10 +1602,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_mip_1darray
@@ -1491,10 +1643,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1539,10 +1695,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_mip_2darray
@@ -1579,10 +1739,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_mip_2darray
@@ -1618,10 +1782,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -3283,10 +3451,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_1d
@@ -3311,10 +3483,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_1d
@@ -3339,10 +3515,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -3373,10 +3553,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_2d
@@ -3401,10 +3585,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_2d
@@ -3429,10 +3617,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -3463,10 +3655,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_3d
@@ -3491,10 +3687,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_3d
@@ -3519,10 +3719,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -3553,10 +3757,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_cube
@@ -3581,10 +3789,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_cube
@@ -3609,10 +3821,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -3643,10 +3859,14 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_1darray
@@ -3671,10 +3891,14 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_1darray
@@ -3699,10 +3923,14 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -3733,10 +3961,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_2darray
@@ -3761,10 +3993,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_2darray
@@ -3789,10 +4025,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -3823,10 +4063,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_2dmsaa
@@ -3851,10 +4095,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_2dmsaa
@@ -3879,10 +4127,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -3913,10 +4165,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_2darraymsaa
@@ -3941,10 +4197,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_2darraymsaa
@@ -3969,10 +4229,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -4081,8 +4345,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
; GFX10NSA-LABEL: name: load_1d_V2
@@ -4107,8 +4373,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
; GFX12-LABEL: name: load_1d_V2
@@ -4133,8 +4401,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -4327,10 +4597,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_1d_glc
@@ -4355,10 +4629,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_1d_glc
@@ -4383,10 +4661,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -4417,10 +4699,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_1d_slc
@@ -4445,10 +4731,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_1d_slc
@@ -4473,10 +4763,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -4507,10 +4801,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_1d_glc_slc
@@ -4535,10 +4833,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_1d_glc_slc
@@ -4563,10 +4865,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -4851,10 +5157,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY3]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: getresinfo_dmask0
@@ -4863,10 +5173,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX10NSA-NEXT: {{ $}}
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY1]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY2]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY3]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: getresinfo_dmask0
@@ -4875,10 +5189,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY2]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY3]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
@@ -4911,10 +5229,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_1d_tfe
@@ -4941,10 +5263,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_1d_tfe
@@ -4971,10 +5297,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX12-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -5013,10 +5343,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_2d_tfe
@@ -5046,10 +5380,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_2d_tfe
@@ -5079,10 +5417,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX12-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -5129,10 +5471,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_3d_tfe
@@ -5169,10 +5515,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_3d_tfe
@@ -5208,10 +5558,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX12-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@@ -5261,10 +5615,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_2darraymsaa_tfe
@@ -5303,10 +5661,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: load_2darraymsaa_tfe
@@ -5344,10 +5706,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX12-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
index f61f985cd24ab1..294172336aef03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
@@ -119,6 +119,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32
; UNPACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s16>), align 8, addrspace 8)
; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<3 x s32>)
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
@@ -126,7 +127,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32
; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C]]
+ ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; UNPACKED-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32)
; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
@@ -363,6 +364,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s,
; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<3 x s16>), align 8, addrspace 8)
; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<4 x s32>)
; UNPACKED-NEXT: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
@@ -370,7 +372,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s,
; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C]]
+ ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; UNPACKED-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32)
; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
@@ -598,6 +600,8 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc,
; UNPACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8)
; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
+ ; UNPACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
@@ -605,9 +609,10 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc,
; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; UNPACKED-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32)
- ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
+ ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -630,10 +635,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc,
; PACKED-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; PACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8)
- ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
- ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+ ; PACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+ ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+ ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; PACKED-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s16>)
; PACKED-NEXT: $vgpr1 = COPY [[BITCAST]](<2 x s16>)
@@ -660,6 +669,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc,
; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s16), addrspace 8)
+ ; UNPACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]], [[C]]
; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -667,7 +679,8 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc,
; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]]
+ ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]]
+ ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -690,10 +703,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc,
; PACKED-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; PACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s16), addrspace 8)
- ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
- ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+ ; PACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+ ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+ ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; PACKED-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s16>)
; PACKED-NEXT: $vgpr1 = COPY [[BITCAST]](<2 x s16>)
@@ -1145,6 +1162,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs
; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8)
; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<3 x s32>)
; UNPACKED-NEXT: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; UNPACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
@@ -1152,9 +1171,10 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs
; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; UNPACKED-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32)
- ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
+ ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1181,10 +1201,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs
; PACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
; PACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
- ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+ ; PACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+ ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+ ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; PACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; PACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
; PACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1217,6 +1241,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs
; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16), addrspace 8)
; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
; UNPACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; UNPACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -1224,7 +1251,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs
; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]]
+ ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]]
+ ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1251,10 +1279,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs
; PACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
; PACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
- ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+ ; PACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+ ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+ ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; PACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; PACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
; PACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1287,6 +1319,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs
; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16), addrspace 8)
; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
; UNPACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; UNPACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -1294,7 +1329,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs
; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]]
+ ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]]
+ ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
@@ -1321,10 +1357,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs
; PACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>)
; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
; PACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32)
- ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+ ; PACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+ ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+ ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; PACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; PACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
; PACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
index adf7e6d38b989b..52030a90ef66e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
@@ -44,8 +44,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32(<8 x i32> inreg %rsrc, i32 %s, i3
; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <2 x float> %tex
@@ -70,9 +72,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32(<8 x i32> inreg %rsrc, i32 %s, i3
; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
@@ -97,10 +102,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32(<8 x i32> inreg %rsrc, i32 %s, i3
; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
@@ -157,8 +166,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32(<8 x i32> inreg %rsrc, i32 %s
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN-NEXT: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <2 x float>, i32 } %res, 0
@@ -188,9 +199,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32(<8 x i32> inreg %rsrc, i32 %s
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GCN-NEXT: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
@@ -220,10 +234,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32(<8 x i32> inreg %rsrc, i32 %s
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GCN-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
@@ -265,8 +283,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_1000(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8)
; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GCN-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <2 x float> %tex
@@ -281,8 +301,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_0000(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY3]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <2 x float> %tex
@@ -308,9 +330,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1100(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
@@ -335,9 +360,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1000(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8)
; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GCN-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[DEF]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
@@ -352,9 +380,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_0000(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN-NEXT: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<3 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY3]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY4]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
@@ -380,10 +411,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1110(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
@@ -409,10 +444,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1100(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
@@ -437,10 +476,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1000(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8)
; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GCN-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[DEF]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
@@ -455,10 +498,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_0000(<8 x i32> inreg %rsrc,
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY3]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY4]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY5]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
@@ -516,8 +563,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_1000(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV2]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV3]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <2 x float>, i32 } %res, 0
@@ -548,8 +597,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV2]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV3]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <2 x float>, i32 } %res, 0
@@ -580,9 +631,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1100(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN-NEXT: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV3]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV4]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV5]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV4]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
@@ -613,9 +667,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1000(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[DEF1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
@@ -646,9 +703,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[DEF1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
@@ -679,10 +739,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1110(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GCN-NEXT: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN-NEXT: $vgpr0 = COPY [[UV4]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV5]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[UV6]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[UV7]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV4]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV6]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV7]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
@@ -713,10 +777,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1100(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN-NEXT: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
@@ -747,10 +815,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1000(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[DEF1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
@@ -781,10 +853,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %r
; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GCN-NEXT: $vgpr1 = COPY [[DEF1]](s32)
- ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32)
- ; GCN-NEXT: $vgpr3 = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32)
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32)
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32)
+ ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32)
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
index 4d36e0f7970167..3d90783b5cf69f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
@@ -25,10 +25,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
; GFX6-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX6-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX6-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX6-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX6-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX6-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX6-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX6-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX6-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX6-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX6-NEXT: $vgpr0 = COPY [[COPY12]](s32)
+ ; GFX6-NEXT: $vgpr1 = COPY [[COPY13]](s32)
+ ; GFX6-NEXT: $vgpr2 = COPY [[COPY14]](s32)
+ ; GFX6-NEXT: $vgpr3 = COPY [[COPY15]](s32)
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_2darraymsaa
@@ -50,10 +54,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY12]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY13]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY14]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY15]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
@@ -84,10 +92,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
; GFX6-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX6-NEXT: G_STORE [[UV4]](s32), [[MV]](p1) :: (store (s32) into %ir.out, addrspace 1)
- ; GFX6-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX6-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX6-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX6-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX6-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX6-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX6-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX6-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX6-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX6-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX6-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX6-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10NSA-LABEL: name: load_2darraymsaa_tfe
@@ -113,10 +125,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[MV]](p1) :: (store (s32) into %ir.out, addrspace 1)
- ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10NSA-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue { <4 x float>, i32 } %v, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
index 5b017ad89a0ed3..f0585516446840 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
@@ -29,10 +29,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY13]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY16]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_1d
@@ -59,10 +63,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY13]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY16]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_1d
@@ -89,10 +97,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY13]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY16]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_1d
@@ -119,10 +131,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY13]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY16]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -155,10 +171,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_2d
@@ -186,10 +206,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_2d
@@ -217,10 +241,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_2d
@@ -248,10 +276,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -289,10 +321,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_3d
@@ -325,10 +361,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_3d
@@ -361,10 +401,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_3d
@@ -396,10 +440,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -437,10 +485,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_cube
@@ -473,10 +525,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cube
@@ -509,10 +565,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cube
@@ -544,10 +604,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -580,10 +644,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_1darray
@@ -611,10 +679,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_1darray
@@ -642,10 +714,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_1darray
@@ -673,10 +749,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -714,10 +794,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_2darray
@@ -750,10 +834,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_2darray
@@ -786,10 +874,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_2darray
@@ -821,10 +913,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -859,10 +955,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_1d
@@ -892,10 +992,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_1d
@@ -925,10 +1029,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_1d
@@ -957,10 +1065,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -996,10 +1108,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_2d
@@ -1030,10 +1146,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_2d
@@ -1064,10 +1184,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_2d
@@ -1097,10 +1221,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1133,10 +1261,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_cl_1d
@@ -1164,10 +1296,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cl_1d
@@ -1195,10 +1331,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cl_1d
@@ -1226,10 +1366,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1267,10 +1411,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_cl_2d
@@ -1303,10 +1451,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cl_2d
@@ -1339,10 +1491,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cl_2d
@@ -1374,10 +1530,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1413,10 +1573,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_cl_1d
@@ -1447,10 +1611,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cl_1d
@@ -1481,10 +1649,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cl_1d
@@ -1514,10 +1686,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1557,10 +1733,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_cl_2d
@@ -1594,10 +1774,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cl_2d
@@ -1631,10 +1815,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cl_2d
@@ -1668,10 +1856,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1707,10 +1899,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_b_1d
@@ -1741,10 +1937,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_b_1d
@@ -1775,10 +1975,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_b_1d
@@ -1808,10 +2012,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1849,10 +2057,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_b_2d
@@ -1885,10 +2097,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_b_2d
@@ -1921,10 +2137,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_b_2d
@@ -1956,10 +2176,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1997,10 +2221,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_b_1d
@@ -2032,10 +2260,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_b_1d
@@ -2067,10 +2299,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_b_1d
@@ -2102,10 +2338,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2145,10 +2385,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_b_2d
@@ -2182,10 +2426,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_b_2d
@@ -2219,10 +2467,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_b_2d
@@ -2256,10 +2508,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2297,10 +2553,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_b_cl_1d
@@ -2333,10 +2593,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_b_cl_1d
@@ -2369,10 +2633,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_b_cl_1d
@@ -2404,10 +2672,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2448,10 +2720,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_b_cl_2d
@@ -2486,10 +2762,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_b_cl_2d
@@ -2524,10 +2804,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_b_cl_2d
@@ -2562,10 +2846,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2605,10 +2893,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_b_cl_1d
@@ -2642,10 +2934,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_b_cl_1d
@@ -2679,10 +2975,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_b_cl_1d
@@ -2716,10 +3016,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2762,10 +3066,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_b_cl_2d
@@ -2802,10 +3110,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_b_cl_2d
@@ -2842,10 +3154,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_b_cl_2d
@@ -2882,10 +3198,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2924,10 +3244,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_d_1d
@@ -2960,10 +3284,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_1d
@@ -2996,10 +3324,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_1d
@@ -3032,10 +3364,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3079,10 +3415,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_d_2d
@@ -3120,10 +3460,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_2d
@@ -3161,10 +3505,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_2d
@@ -3202,10 +3550,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3259,10 +3611,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_d_3d
@@ -3311,10 +3667,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_3d
@@ -3363,10 +3723,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_3d
@@ -3415,10 +3779,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3459,10 +3827,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_d_1d
@@ -3497,10 +3869,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_1d
@@ -3535,10 +3911,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_1d
@@ -3573,10 +3953,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3622,10 +4006,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_d_2d
@@ -3665,10 +4053,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_2d
@@ -3708,10 +4100,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_2d
@@ -3751,10 +4147,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3795,10 +4195,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_d_cl_1d
@@ -3833,10 +4237,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_cl_1d
@@ -3871,10 +4279,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_cl_1d
@@ -3909,10 +4321,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -3960,10 +4376,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_d_cl_2d
@@ -4005,10 +4425,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_cl_2d
@@ -4050,10 +4474,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_cl_2d
@@ -4095,10 +4523,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4141,10 +4573,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_d_cl_1d
@@ -4181,10 +4617,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_cl_1d
@@ -4221,10 +4661,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_cl_1d
@@ -4261,10 +4705,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4314,10 +4762,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_d_cl_2d
@@ -4361,10 +4813,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_cl_2d
@@ -4408,10 +4864,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_cl_2d
@@ -4456,10 +4916,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4498,10 +4962,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_cd_1d
@@ -4534,10 +5002,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cd_1d
@@ -4570,10 +5042,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cd_1d
@@ -4606,10 +5082,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4653,10 +5133,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_cd_2d
@@ -4694,10 +5178,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cd_2d
@@ -4735,10 +5223,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cd_2d
@@ -4776,10 +5268,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4820,10 +5316,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_cd_1d
@@ -4858,10 +5358,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cd_1d
@@ -4896,10 +5400,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cd_1d
@@ -4934,10 +5442,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -4983,10 +5495,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_cd_2d
@@ -5026,10 +5542,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cd_2d
@@ -5069,10 +5589,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cd_2d
@@ -5112,10 +5636,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5156,10 +5684,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_cd_cl_1d
@@ -5194,10 +5726,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cd_cl_1d
@@ -5232,10 +5768,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cd_cl_1d
@@ -5270,10 +5810,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5321,10 +5865,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_cd_cl_2d
@@ -5366,10 +5914,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cd_cl_2d
@@ -5411,10 +5963,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cd_cl_2d
@@ -5456,10 +6012,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX12-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5502,10 +6062,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_cd_cl_1d
@@ -5542,10 +6106,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cd_cl_1d
@@ -5582,10 +6150,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cd_cl_1d
@@ -5622,10 +6194,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5675,10 +6251,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_cd_cl_2d
@@ -5722,10 +6302,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cd_cl_2d
@@ -5769,10 +6353,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cd_cl_2d
@@ -5817,10 +6405,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5853,10 +6445,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_l_1d
@@ -5884,10 +6480,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_l_1d
@@ -5915,10 +6515,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_l_1d
@@ -5946,10 +6550,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -5987,10 +6595,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_l_2d
@@ -6023,10 +6635,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_l_2d
@@ -6059,10 +6675,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_l_2d
@@ -6094,10 +6714,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6133,10 +6757,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_l_1d
@@ -6167,10 +6795,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_l_1d
@@ -6201,10 +6833,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_l_1d
@@ -6234,10 +6870,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6277,10 +6917,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_l_2d
@@ -6314,10 +6958,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_l_2d
@@ -6351,10 +6999,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_l_2d
@@ -6388,10 +7040,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6423,10 +7079,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY13]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY16]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_lz_1d
@@ -6453,10 +7113,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY13]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY16]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_lz_1d
@@ -6483,10 +7147,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY13]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY16]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_lz_1d
@@ -6513,10 +7181,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY13]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY16]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6549,10 +7221,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_lz_2d
@@ -6580,10 +7256,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_lz_2d
@@ -6611,10 +7291,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_lz_2d
@@ -6642,10 +7326,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6680,10 +7368,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_lz_1d
@@ -6713,10 +7405,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_lz_1d
@@ -6746,10 +7442,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_lz_1d
@@ -6778,10 +7478,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -6817,10 +7521,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX10-LABEL: name: sample_c_lz_2d
@@ -6851,10 +7559,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_lz_2d
@@ -6885,10 +7597,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_lz_2d
@@ -6918,10 +7634,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -7162,8 +7882,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY22]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
; GFX10-LABEL: name: sample_c_d_o_2darray_V2
@@ -7210,8 +7932,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
; GFX11-LABEL: name: sample_c_d_o_2darray_V2
@@ -7258,8 +7982,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
; GFX12-LABEL: name: sample_c_d_o_2darray_V2
@@ -7306,8 +8032,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
main_body:
%v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
index 241170b94318a5..d7c1c7a6bef5ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
@@ -34,10 +34,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<9 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<9 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_3d
@@ -70,10 +74,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<5 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_3d
@@ -106,10 +114,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<6 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -148,10 +160,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<10 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[BUILD_VECTOR2]](<10 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY23]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY24]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY25]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_3d
@@ -185,10 +201,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<6 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY23]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY24]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY25]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_3d
@@ -222,10 +242,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<7 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY23]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY24]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY25]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.3d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -265,10 +289,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<11 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[BUILD_VECTOR2]](<11 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY23]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY24]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY25]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY26]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_cl_3d
@@ -303,10 +331,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<7 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY23]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY24]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY25]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY26]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_cl_3d
@@ -341,10 +373,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY23]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY24]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY25]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY26]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.3d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -385,10 +421,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<12 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[BUILD_VECTOR2]](<12 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY24]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY25]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY26]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY27]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_cl_o_3d
@@ -424,10 +464,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY24]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY25]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY26]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY27]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_cl_o_3d
@@ -463,10 +507,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<9 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<9 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY24]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY25]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY26]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY27]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.3d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
index f05b258c974d1d..477965ab8981b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
@@ -34,10 +34,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_1d_g16_a16
@@ -70,10 +74,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_1d_g16_a16
@@ -106,10 +114,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -152,10 +164,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_2d_g16_a16
@@ -193,10 +209,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_2d_g16_a16
@@ -234,10 +254,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -291,10 +315,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_3d_g16_a16
@@ -343,10 +371,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_3d_g16_a16
@@ -395,10 +427,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
index cc2a8ba9c4d5d9..e78a9897be9c5a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
@@ -33,10 +33,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_1d
@@ -68,10 +72,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_1d
@@ -103,10 +111,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -148,10 +160,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_2d
@@ -188,10 +204,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_2d
@@ -228,10 +248,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -283,10 +307,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_3d
@@ -333,10 +361,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_3d
@@ -383,10 +415,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY23]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY24]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -425,10 +461,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_1d
@@ -462,10 +502,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_1d
@@ -499,10 +543,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -546,10 +594,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_2d
@@ -588,10 +640,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_2d
@@ -631,10 +687,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -673,10 +733,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_cl_1d
@@ -710,10 +774,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_cl_1d
@@ -747,10 +815,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -794,10 +866,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_d_cl_2d
@@ -836,10 +912,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_d_cl_2d
@@ -879,10 +959,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -923,10 +1007,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_cl_1d
@@ -962,10 +1050,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_cl_1d
@@ -1002,10 +1094,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1052,10 +1148,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_d_cl_2d
@@ -1097,10 +1197,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_d_cl_2d
@@ -1142,10 +1246,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1182,10 +1290,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cd_1d
@@ -1217,10 +1329,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cd_1d
@@ -1252,10 +1368,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1297,10 +1417,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cd_2d
@@ -1337,10 +1461,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cd_2d
@@ -1377,10 +1505,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1419,10 +1551,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cd_1d
@@ -1456,10 +1592,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cd_1d
@@ -1493,10 +1633,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1540,10 +1684,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cd_2d
@@ -1582,10 +1730,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cd_2d
@@ -1625,10 +1777,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1667,10 +1823,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cd_cl_1d
@@ -1704,10 +1864,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cd_cl_1d
@@ -1741,10 +1905,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1788,10 +1956,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_cd_cl_2d
@@ -1830,10 +2002,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_cd_cl_2d
@@ -1873,10 +2049,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1917,10 +2097,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cd_cl_1d
@@ -1956,10 +2140,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cd_cl_1d
@@ -1996,10 +2184,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2046,10 +2238,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX11-LABEL: name: sample_c_cd_cl_2d
@@ -2091,10 +2287,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX11-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
; GFX12-LABEL: name: sample_c_cd_cl_2d
@@ -2136,10 +2336,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY20]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY22]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY23]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2322,8 +2526,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
; GFX11-LABEL: name: sample_c_d_o_2darray_V2
@@ -2367,8 +2573,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
; GFX12-LABEL: name: sample_c_d_o_2darray_V2
@@ -2412,8 +2620,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8)
; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32)
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
main_body:
%v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
index 12234088adca65..67ff69a70c1ce4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
@@ -242,12 +242,23 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
- ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>)
+ ; UNPACKED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+ ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+ ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; UNPACKED-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+ ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; UNPACKED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+ ; UNPACKED-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; UNPACKED-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+ ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+ ; UNPACKED-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
; UNPACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[LSHR]](s32), [[BITCAST1]](s32)
+ ; UNPACKED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LSHR]](s32), [[LSHR1]](s32), [[LSHR2]](s32)
; UNPACKED-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
; UNPACKED-NEXT: S_ENDPGM 0
;
@@ -268,26 +279,37 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX81-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX81-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX81-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX81-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
- ; GFX81-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX81-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX81-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>)
+ ; GFX81-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+ ; GFX81-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX81-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX81-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; GFX81-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX81-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+ ; GFX81-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX81-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+ ; GFX81-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX81-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX81-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+ ; GFX81-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX81-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+ ; GFX81-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+ ; GFX81-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
; GFX81-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX81-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX81-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
- ; GFX81-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32)
+ ; GFX81-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX81-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
+ ; GFX81-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C1]](s32)
; GFX81-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
- ; GFX81-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; GFX81-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
- ; GFX81-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX81-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
+ ; GFX81-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; GFX81-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C2]]
+ ; GFX81-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+ ; GFX81-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32)
; GFX81-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL1]]
- ; GFX81-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; GFX81-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
- ; GFX81-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; GFX81-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
- ; GFX81-NEXT: [[BITCAST5:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX81-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+ ; GFX81-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+ ; GFX81-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL1]]
+ ; GFX81-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+ ; GFX81-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+ ; GFX81-NEXT: [[BITCAST6:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX81-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST6]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
; GFX81-NEXT: S_ENDPGM 0
;
; GFX9-LABEL: name: image_store_v3f16
@@ -308,8 +330,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
+ ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: image_store_v3f16
@@ -330,8 +373,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+ ; GFX10-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
+ ; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: image_store_v3f16
@@ -352,7 +416,28 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
- ; GFX12-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+ ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32)
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32)
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX12-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32)
+ ; GFX12-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32)
+ ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX12-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
+ ; GFX12-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
; GFX12-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 41e915a4c1011b..7c2c61deca375f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -375,10 +375,15 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
; GFX6-LABEL: abs_sgpr_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: s_sext_i32_i16 s1, s1
-; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: s_abs_i32 s1, s1
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: abs_sgpr_v2i16:
@@ -415,6 +420,11 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
; GFX6-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll
index b6b4301dadc7a5..9c2ac009e44a31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll
@@ -14,16 +14,18 @@ define <4 x i8> @global_load_v4i8_align4__rangemd(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p1) :: (load (s32) from %ir.ptr, addrspace 1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32)
- ; CHECK-NEXT: $vgpr1 = COPY [[LSHR]](s32)
- ; CHECK-NEXT: $vgpr2 = COPY [[LSHR1]](s32)
- ; CHECK-NEXT: $vgpr3 = COPY [[LSHR2]](s32)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+ ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C3]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: $vgpr1 = COPY [[LSHR1]](s32)
+ ; CHECK-NEXT: $vgpr2 = COPY [[LSHR2]](s32)
+ ; CHECK-NEXT: $vgpr3 = COPY [[LSHR3]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%load = load <4 x i8>, ptr addrspace(1) %ptr, align 4, !range !0, !noundef !1
ret <4 x i8> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..d2793000a31e2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -712,6 +712,9 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16:
@@ -741,8 +744,11 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
; GFX6-LABEL: v_lshr_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_u32 v0, v0, 15, 1
; GFX6-NEXT: v_bfe_u32 v1, v1, 15, 1
+; GFX6-NEXT: v_bfe_u32 v0, v0, 15, 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index f426fb8954ed26..4d400d53916f16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -2115,7 +2115,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2172,7 +2172,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2237,7 +2237,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %43, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2294,7 +2294,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2357,7 +2357,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %43, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2414,7 +2414,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2476,7 +2476,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2534,7 +2534,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2596,7 +2596,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2654,7 +2654,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2716,7 +2716,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2774,7 +2774,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2835,7 +2835,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2891,7 +2891,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 168e6dfa5f147d..e361ebdf9b608e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -2750,25 +2750,32 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
+; GFX6-NEXT: v_min_i32_e32 v6, 0, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v7, 1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v7, v6
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
-; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v6, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
+; GFX6-NEXT: v_bfrev_b32_e32 v5, -2
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..d641913ada13da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -636,8 +636,13 @@ define <2 x i16> @v_sext_inreg_v2i16_8(<2 x i16> %value) {
; GFX6-LABEL: v_sext_inreg_v2i16_8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sext_inreg_v2i16_8:
@@ -673,8 +678,13 @@ define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) {
; GFX6-LABEL: v_sext_inreg_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sext_inreg_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..08fc956f2dc45a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -720,6 +720,11 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16:
@@ -750,7 +755,10 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 15, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 15, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 2572f8581f0edf..4d5a8cb6d69020 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -2753,22 +2753,29 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
-; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT: v_min_i32_e32 v6, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v7, 1
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v7
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
-; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_bfrev_b32_e32 v5, -2
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4
; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 788692c94b0cfa..a52e70a4cfc488 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -1872,8 +1872,9 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_not_b32_e32 v3, v1
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 0042d34e235d17..1e3c6d1559dab1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1784,8 +1784,9 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 9f093cc7b5abf2..d21e3e7165ef0c 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -9251,11 +9251,21 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v0, v2
; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v0, v1
+; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v0, v1
+; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v1, v5
+; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7
; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v4
-; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v5
+; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: multi_use_mul_mad_v2i16_var:
@@ -9373,12 +9383,17 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v1
+; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GFX67-GISEL-NEXT: s_mov_b32 m0, -1
+; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX67-GISEL-NEXT: ds_write_b32 v6, v2
; GFX67-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 4bed23487445a6..9661154a643815 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6338,14 +6338,17 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
; SI-GISEL-LABEL: v_exp_v2f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp_v2f16:
@@ -6444,15 +6447,18 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp_fabs_v2f16:
@@ -6556,15 +6562,18 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp_fneg_fabs_v2f16:
@@ -6669,15 +6678,18 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp_fneg_v2f16:
@@ -6758,19 +6770,22 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; SI-GISEL-LABEL: v_exp_v2f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp_v2f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index ec7e52532cd327..045492aeed07bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6431,14 +6431,17 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
; SI-GISEL-LABEL: v_exp10_v2f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp10_v2f16:
@@ -6537,15 +6540,18 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp10_fabs_v2f16:
@@ -6649,15 +6655,18 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp10_fneg_fabs_v2f16:
@@ -6762,15 +6771,18 @@ define <2 x half> @v_exp10_fneg_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp10_fneg_v2f16:
@@ -6852,19 +6864,22 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; SI-GISEL-LABEL: v_exp10_v2f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp10_v2f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 32b599e63c61d2..1e520c1750f5f4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -2307,12 +2307,15 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
; SI-GISEL-LABEL: v_exp2_v2f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_v2f16:
@@ -2384,12 +2387,15 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_fabs_v2f16:
@@ -2468,12 +2474,15 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -2553,12 +2562,15 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_fneg_v2f16:
@@ -2628,12 +2640,15 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
; SI-GISEL-LABEL: v_exp2_v2f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_v2f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index b9fef0834cb245..fa85f0db33e2ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -236,9 +236,12 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v3
; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v5, v1
; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32:
@@ -323,8 +326,11 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v3, v1
; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v2
; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 72e86f1f6f9992..17b24ad2ee08ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -504,12 +504,15 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i32:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32:
@@ -638,8 +641,11 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX6-GISEL-NEXT: v_bfe_i32 v2, v3, 0, 16
; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v2
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i16:
@@ -1087,18 +1093,24 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
; GFX6-GISEL-LABEL: test_ldexp_v4f16_v4i32:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v4
+; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v5
-; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v6
+; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v4
+; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v3, v7
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v6
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i32:
@@ -1292,11 +1304,17 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
; GFX6-GISEL-NEXT: v_bfe_i32 v4, v6, 0, 16
; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v4
; GFX6-GISEL-NEXT: v_bfe_i32 v4, v7, 0, 16
+; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v3, v4
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 7f4cf19e9b85b4..897f0e9f024b2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6608,14 +6608,17 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
; SI-GISEL-LABEL: v_log_v2f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log_v2f16:
@@ -6701,15 +6704,18 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log_fabs_v2f16:
@@ -6827,15 +6833,18 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log_fneg_fabs_v2f16:
@@ -6954,15 +6963,18 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log_fneg_v2f16:
@@ -7072,14 +7084,17 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
; SI-GISEL-LABEL: v_log_v2f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log_v2f16_fast:
@@ -7363,22 +7378,28 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
; SI-GISEL-LABEL: v_log_v4f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log_v4f16:
@@ -7531,22 +7552,28 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
; SI-GISEL-LABEL: v_log_v4f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 1c64e6b76c9577..74c56f5f22875e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6608,14 +6608,17 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
; SI-GISEL-LABEL: v_log10_v2f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log10_v2f16:
@@ -6701,15 +6704,18 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log10_fabs_v2f16:
@@ -6827,15 +6833,18 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
@@ -6954,15 +6963,18 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
-; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log10_fneg_v2f16:
@@ -7072,14 +7084,17 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
; SI-GISEL-LABEL: v_log10_v2f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log10_v2f16_fast:
@@ -7363,22 +7378,28 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
; SI-GISEL-LABEL: v_log10_v4f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log10_v4f16:
@@ -7531,22 +7552,28 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
; SI-GISEL-LABEL: v_log10_v4f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 50c52037dc4d31..87f46f6000961b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -3073,12 +3073,15 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
; SI-GISEL-LABEL: v_log2_v2f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_v2f16:
@@ -3161,12 +3164,15 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_fabs_v2f16:
@@ -3268,12 +3274,15 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
@@ -3376,12 +3385,15 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_fneg_v2f16:
@@ -3474,12 +3486,15 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
; SI-GISEL-LABEL: v_log2_v2f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_v2f16_fast:
@@ -3759,18 +3774,24 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
; SI-GISEL-LABEL: v_log2_v4f16:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_v4f16:
@@ -3889,18 +3910,24 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
; SI-GISEL-LABEL: v_log2_v4f16_fast:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 95d579be04ed27..267236e53b40bc 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -472,16 +472,19 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
; GISEL-CI-LABEL: v_mad_mix_v2f32:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
+; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -794,26 +797,32 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; GISEL-CI-LABEL: v_mad_mix_v4f32:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5
-; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6
+; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v9
+; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v8
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v11
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GISEL-CI-NEXT: v_or_b32_e32 v2, v3, v1
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <4 x half> %src0 to <4 x float>
%src1.ext = fpext <4 x half> %src1 to <4 x float>
@@ -909,30 +918,33 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
-; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1322,52 +1334,58 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5
+; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6
; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2
; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5
; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v5
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v3, v3, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GISEL-CI-NEXT: v_or_b32_e32 v2, v3, v1
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <4 x half> %src0 to <4 x float>
%src1.ext = fpext <4 x half> %src1 to <4 x float>
@@ -1514,17 +1532,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT: v_or_b32_e32 v1, v1, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
-; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1676,16 +1692,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
-; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1824,16 +1836,19 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -2222,26 +2237,32 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GISEL-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; GISEL-CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp
-; GISEL-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp
+; GISEL-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp
; GISEL-CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GISEL-CI-NEXT: v_or_b32_e32 v2, v2, v1
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <4 x half> %src0 to <4 x float>
%src1.ext = fpext <4 x half> %src1 to <4 x float>
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 0f95c0255d3abc..3015707418d0aa 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -404,23 +404,29 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX6-LABEL: v_roundeven_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16:
@@ -522,13 +528,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-NEXT: v_rndne_f32_e32 v0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_rndne_f32_e32 v1, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v1
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16_fneg:
@@ -538,13 +547,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT: v_rndne_f32_e32 v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_rndne_f32_e32 v1, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v1
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16_fneg:
@@ -655,35 +667,47 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX6-LABEL: v_roundeven_v4f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_rndne_f32_e32 v1, v1
-; GFX6-NEXT: v_rndne_f32_e32 v2, v2
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_rndne_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_rndne_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v4f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_rndne_f32_e32 v1, v1
-; GFX7-NEXT: v_rndne_f32_e32 v2, v2
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_rndne_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_rndne_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index d9e0e0298e072f..6dedc6920a30e7 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
- ; CHECK-NEXT: early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %17:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %17.sub0
; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %17.sub1
; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]]
>From 55aa772009b50313f7acfbfe586d32c117c1af37 Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Sat, 19 Oct 2024 18:52:53 +0200
Subject: [PATCH 2/2] [GlobalIsel] Second round: lower MAX_NUM_DEFS_LIMIT to 4 and drop vector_ops_combines from the AArch64 post-legalizer combiner
---
.../GlobalISel/CombinerHelperArtifacts.cpp | 2 +-
llvm/lib/Target/AArch64/AArch64Combine.td | 2 +-
.../AArch64/GlobalISel/combine-unmerge.mir | 17 +-
llvm/test/CodeGen/AArch64/abs.ll | 91 +----
llvm/test/CodeGen/AArch64/bitcast.ll | 8 +-
llvm/test/CodeGen/AArch64/bswap.ll | 27 +-
llvm/test/CodeGen/AArch64/fabs.ll | 34 +-
llvm/test/CodeGen/AArch64/faddsub.ll | 132 ++-----
llvm/test/CodeGen/AArch64/fcmp.ll | 298 +++++---------
llvm/test/CodeGen/AArch64/fcopysign.ll | 55 +--
llvm/test/CodeGen/AArch64/fcvt.ll | 371 +++++-------------
llvm/test/CodeGen/AArch64/fdiv.ll | 66 +---
llvm/test/CodeGen/AArch64/fexplog.ll | 40 +-
llvm/test/CodeGen/AArch64/fminimummaximum.ll | 132 ++-----
llvm/test/CodeGen/AArch64/fminmax.ll | 132 ++-----
llvm/test/CodeGen/AArch64/fmla.ll | 248 +++---------
llvm/test/CodeGen/AArch64/fmul.ll | 66 +---
llvm/test/CodeGen/AArch64/fneg.ll | 32 +-
llvm/test/CodeGen/AArch64/fpow.ll | 8 +-
llvm/test/CodeGen/AArch64/fpowi.ll | 8 +-
.../test/CodeGen/AArch64/fptosi-sat-vector.ll | 161 ++++----
.../test/CodeGen/AArch64/fptoui-sat-vector.ll | 138 +++----
llvm/test/CodeGen/AArch64/frem.ll | 8 +-
llvm/test/CodeGen/AArch64/fsincos.ll | 16 +-
llvm/test/CodeGen/AArch64/fsqrt.ll | 49 +--
llvm/test/CodeGen/AArch64/load.ll | 69 ++--
llvm/test/CodeGen/AArch64/shift.ll | 348 ++--------------
llvm/test/CodeGen/AArch64/shufflevector.ll | 158 +-------
.../regbankselect-amdgcn.s.buffer.load.ll | 28 +-
29 files changed, 661 insertions(+), 2083 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 805d34ae0493c4..cab250ee7e62fa 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -113,7 +113,7 @@ bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
bool CombinerHelper::matchUnmergeValuesOfScalarAndVector(const MachineInstr &MI,
BuildFnTy &MatchInfo) {
- constexpr unsigned MAX_NUM_DEFS_LIMIT = 8;
+ constexpr unsigned MAX_NUM_DEFS_LIMIT = 4;
// %opaque:_(<2 x s64>) = G_OPAQUE
// %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1eb7488e4ff570..8af8cdfeba6ac4 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -322,7 +322,7 @@ def AArch64PostLegalizerCombiner
extractvecelt_pairwise_add, redundant_or,
mul_const, redundant_sext_inreg,
form_bitfield_extract, rotate_out_of_range,
- icmp_to_true_false_known_bits, vector_ops_combines,
+ icmp_to_true_false_known_bits,
select_combines, fold_merge_to_zext,
constant_fold_binops, identity_combines,
ptr_add_immed_chain, overlapping_and,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index fc7584a2e1b162..e401cebd93a924 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -585,22 +585,7 @@ body: |
bb.1:
; CHECK-LABEL: name: test_long_opaque_vector_scalar
; CHECK: %opaque:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: %un1:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C]](s64)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: %un2:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C1]](s64)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; CHECK-NEXT: %un3:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C2]](s64)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
- ; CHECK-NEXT: %un4:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C3]](s64)
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
- ; CHECK-NEXT: %un5:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C4]](s64)
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
- ; CHECK-NEXT: %un6:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C5]](s64)
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
- ; CHECK-NEXT: %un7:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C6]](s64)
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
- ; CHECK-NEXT: %un8:_(s16) = G_EXTRACT_VECTOR_ELT %opaque(<8 x s16>), [[C7]](s64)
+ ; CHECK-NEXT: %un1:_(s16), %un2:_(s16), %un3:_(s16), %un4:_(s16), %un5:_(s16), %un6:_(s16), %un7:_(s16), %un8:_(s16) = G_UNMERGE_VALUES %opaque(<8 x s16>)
; CHECK-NEXT: %zext1:_(s32) = G_ZEXT %un1(s16)
; CHECK-NEXT: %zext2:_(s32) = G_ZEXT %un2(s16)
; CHECK-NEXT: %zext3:_(s32) = G_ZEXT %un3(s16)
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index b5794007bdddb0..29fe2d02a93e11 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -355,66 +355,10 @@ entry:
declare <3 x i8> @llvm.abs.v3i8(<3 x i8>, i1)
define <7 x i8> @abs_v7i8(<7 x i8> %a){
-; CHECK-SD-LABEL: abs_v7i8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: abs v0.8b, v0.8b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: abs_v7i8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[2]
-; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov b1, v0.b[3]
-; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[4]
-; CHECK-GI-NEXT: mov v2.b[3], v1.b[0]
-; CHECK-GI-NEXT: mov b1, v0.b[5]
-; CHECK-GI-NEXT: mov b0, v0.b[6]
-; CHECK-GI-NEXT: mov v2.b[4], v3.b[0]
-; CHECK-GI-NEXT: mov v2.b[5], v1.b[0]
-; CHECK-GI-NEXT: mov v2.b[6], v0.b[0]
-; CHECK-GI-NEXT: abs v0.8b, v2.8b
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov b1, v0.b[5]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.h[2], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.h[4], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.h[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.h[6], w8
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[3]
-; CHECK-GI-NEXT: mov h3, v0.h[4]
-; CHECK-GI-NEXT: mov h4, v0.h[5]
-; CHECK-GI-NEXT: mov h5, v0.h[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov h1, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.b[4], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.b[6], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: abs_v7i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: abs v0.8b, v0.8b
+; CHECK-NEXT: ret
entry:
%res = call <7 x i8> @llvm.abs.v7i8(<7 x i8> %a, i1 0)
ret <7 x i8> %res
@@ -453,29 +397,10 @@ entry:
declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
define <7 x i16> @abs_v7i16(<7 x i16> %a){
-; CHECK-SD-LABEL: abs_v7i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: abs v0.8h, v0.8h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: abs_v7i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-NEXT: abs v1.8h, v1.8h
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: abs_v7i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: abs v0.8h, v0.8h
+; CHECK-NEXT: ret
entry:
%res = call <7 x i16> @llvm.abs.v7i16(<7 x i16> %a, i1 0)
ret <7 x i16> %res
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 8449b69a473d92..e34bac2e2fa69a 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -647,13 +647,7 @@ define <6 x i16> @bitcast_v3i32_v6i16(<3 x i32> %a, <3 x i32> %b){
; CHECK-GI-NEXT: mov v3.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v2.s[2], v0.s[2]
; CHECK-GI-NEXT: mov v3.s[2], v1.s[2]
-; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
+; CHECK-GI-NEXT: add v0.4s, v2.4s, v3.4s
; CHECK-GI-NEXT: ret
%c = add <3 x i32> %a, %b
%d = bitcast <3 x i32> %c to <6 x i16>
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9f9653fcbb50b5..fd1ac47bef7d15 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -277,29 +277,10 @@ entry:
declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>)
define <7 x i16> @bswap_v7i16(<7 x i16> %a){
-; CHECK-SD-LABEL: bswap_v7i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: rev16 v0.16b, v0.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: bswap_v7i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-NEXT: rev16 v1.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: bswap_v7i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: ret
entry:
%res = call <7 x i16> @llvm.bswap.v7i16(<7 x i16> %a)
ret <7 x i16> %res
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index 1aed6cb8bf9ed8..0e1f9fba307add 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -174,41 +174,13 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mvni v0.8h, #128, lsl #8
-; CHECK-GI-NOFP16-NEXT: and v1.16b, v1.16b, v0.16b
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mvni v1.8h, #128, lsl #8
+; CHECK-GI-NOFP16-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fabs_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: fabs v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fabs v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.fabs.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index 4227c891d844f4..de9a458a98b60f 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -201,68 +201,32 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fadd_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fadd v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fadd_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: fadd v1.8h, v2.8h, v3.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fadd <7 x half> %a, %b
@@ -593,68 +557,32 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fsub_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fsub v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: fsub v0.4s, v0.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fsub_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: fsub v1.8h, v2.8h, v3.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fsub v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fsub <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 584ffa92493d08..c1459ac5b56434 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1245,134 +1245,70 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
;
; CHECK-GI-NOFP16-LABEL: v7f16_half:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov w8, #15 // =0xf
-; CHECK-GI-NOFP16-NEXT: fmov s6, w8
-; CHECK-GI-NOFP16-NEXT: mov v17.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: fmov s5, w8
; CHECK-GI-NOFP16-NEXT: mov w9, #65535 // =0xffff
-; CHECK-GI-NOFP16-NEXT: mov v16.h[0], v1.h[4]
; CHECK-GI-NOFP16-NEXT: fmov s7, w9
-; CHECK-GI-NOFP16-NEXT: mov v18.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v19.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: mov v7.h[1], w9
-; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v18.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v19.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], w8
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], w8
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: mov v7.h[2], w9
-; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v18.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v19.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[3], w8
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v17.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v5.h[3], w8
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v6.4h
; CHECK-GI-NOFP16-NEXT: mov v7.h[3], w9
-; CHECK-GI-NOFP16-NEXT: mov v18.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v19.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v16.4h
-; CHECK-GI-NOFP16-NEXT: mov v6.h[4], w8
+; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v5.h[4], w8
+; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v6.4s, v4.4s
; CHECK-GI-NOFP16-NEXT: mov v7.h[4], w9
-; CHECK-GI-NOFP16-NEXT: mov v18.h[4], v2.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v19.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v4.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v5.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v6.h[5], w8
+; CHECK-GI-NOFP16-NEXT: mov v5.h[5], w8
+; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NOFP16-NEXT: mov v7.h[5], w9
-; CHECK-GI-NOFP16-NEXT: mov v18.h[5], v2.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v19.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-GI-NOFP16-NEXT: mov v6.h[6], w8
+; CHECK-GI-NOFP16-NEXT: mov v5.h[6], w8
; CHECK-GI-NOFP16-NEXT: mov v7.h[6], w9
-; CHECK-GI-NOFP16-NEXT: mov v18.h[6], v2.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v19.h[6], v3.h[6]
-; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v6.8h
-; CHECK-GI-NOFP16-NEXT: neg v1.8h, v6.8h
+; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v5.8h
+; CHECK-GI-NOFP16-NEXT: neg v1.8h, v5.8h
; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v0.8h, v1.8h
; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT: and v0.16b, v18.16b, v0.16b
-; CHECK-GI-NOFP16-NEXT: and v1.16b, v19.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: orr v1.16b, v0.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NOFP16-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: v7f16_half:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v4.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[0], v1.h[0]
; CHECK-GI-FP16-NEXT: mov w8, #15 // =0xf
-; CHECK-GI-FP16-NEXT: fmov s6, w8
; CHECK-GI-FP16-NEXT: mov w9, #65535 // =0xffff
-; CHECK-GI-FP16-NEXT: mov v16.h[0], v2.h[0]
-; CHECK-GI-FP16-NEXT: fmov s7, w9
-; CHECK-GI-FP16-NEXT: mov v17.h[0], v3.h[0]
-; CHECK-GI-FP16-NEXT: mov v4.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v5.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v6.h[1], w8
-; CHECK-GI-FP16-NEXT: mov v7.h[1], w9
-; CHECK-GI-FP16-NEXT: mov v16.h[1], v2.h[1]
-; CHECK-GI-FP16-NEXT: mov v17.h[1], v3.h[1]
-; CHECK-GI-FP16-NEXT: mov v4.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v5.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v6.h[2], w8
-; CHECK-GI-FP16-NEXT: mov v7.h[2], w9
-; CHECK-GI-FP16-NEXT: mov v16.h[2], v2.h[2]
-; CHECK-GI-FP16-NEXT: mov v17.h[2], v3.h[2]
-; CHECK-GI-FP16-NEXT: mov v4.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v5.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v6.h[3], w8
-; CHECK-GI-FP16-NEXT: mov v7.h[3], w9
-; CHECK-GI-FP16-NEXT: mov v16.h[3], v2.h[3]
-; CHECK-GI-FP16-NEXT: mov v17.h[3], v3.h[3]
-; CHECK-GI-FP16-NEXT: mov v4.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v5.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v6.h[4], w8
-; CHECK-GI-FP16-NEXT: mov v7.h[4], w9
-; CHECK-GI-FP16-NEXT: mov v16.h[4], v2.h[4]
-; CHECK-GI-FP16-NEXT: mov v17.h[4], v3.h[4]
-; CHECK-GI-FP16-NEXT: mov v4.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v5.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v6.h[5], w8
-; CHECK-GI-FP16-NEXT: mov v7.h[5], w9
-; CHECK-GI-FP16-NEXT: mov v16.h[5], v2.h[5]
-; CHECK-GI-FP16-NEXT: mov v17.h[5], v3.h[5]
-; CHECK-GI-FP16-NEXT: mov v4.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v5.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: mov v6.h[6], w8
-; CHECK-GI-FP16-NEXT: mov v7.h[6], w9
-; CHECK-GI-FP16-NEXT: mov v16.h[6], v2.h[6]
-; CHECK-GI-FP16-NEXT: mov v17.h[6], v3.h[6]
-; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v5.8h, v4.8h
-; CHECK-GI-FP16-NEXT: neg v1.8h, v6.8h
-; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v6.8h
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: fmov s4, w8
+; CHECK-GI-FP16-NEXT: fmov s5, w9
+; CHECK-GI-FP16-NEXT: mov v4.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[1], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[2], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[2], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[3], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[3], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[4], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[4], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[5], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[5], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[6], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[6], w9
+; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v4.8h
+; CHECK-GI-FP16-NEXT: neg v1.8h, v4.8h
; CHECK-GI-FP16-NEXT: sshl v0.8h, v0.8h, v1.8h
-; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v7.16b
-; CHECK-GI-FP16-NEXT: and v0.16b, v16.16b, v0.16b
-; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b
-; CHECK-GI-FP16-NEXT: orr v1.16b, v0.16b, v1.16b
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-FP16-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-GI-FP16-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-GI-FP16-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fcmp olt <7 x half> %a, %b
@@ -1795,69 +1731,61 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
;
; CHECK-GI-NOFP16-LABEL: v7f16_i32:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v4.s[0], w8
; CHECK-GI-NOFP16-NEXT: mov w9, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT: mov v6.s[0], w8
-; CHECK-GI-NOFP16-NEXT: mov v16.s[0], w9
-; CHECK-GI-NOFP16-NEXT: ldr s18, [sp]
-; CHECK-GI-NOFP16-NEXT: mov v7.s[0], w0
-; CHECK-GI-NOFP16-NEXT: mov v17.s[0], w7
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v6.s[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[1], w1
-; CHECK-GI-NOFP16-NEXT: mov v17.s[1], v18.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.s[0], w0
+; CHECK-GI-NOFP16-NEXT: mov v6.s[0], w9
+; CHECK-GI-NOFP16-NEXT: mov v7.s[0], w7
+; CHECK-GI-NOFP16-NEXT: ldr s16, [sp]
+; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #24]
; CHECK-GI-NOFP16-NEXT: ldr s18, [sp, #32]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v6.s[2], w8
-; CHECK-GI-NOFP16-NEXT: mov v16.s[2], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[2], w2
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v1.s[0], w4
-; CHECK-GI-NOFP16-NEXT: mov v7.s[3], w3
-; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v5.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: ldr s5, [sp, #24]
-; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #8]
-; CHECK-GI-NOFP16-NEXT: mov v1.s[1], w5
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w1
+; CHECK-GI-NOFP16-NEXT: mov v17.s[1], v18.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v6.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v16.s[0]
+; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #8]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w2
+; CHECK-GI-NOFP16-NEXT: mov v6.s[2], w9
+; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #40]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v5.s[1], v18.s[0]
-; CHECK-GI-NOFP16-NEXT: mov v17.s[2], v4.s[0]
-; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #40]
-; CHECK-GI-NOFP16-NEXT: ushl v0.4s, v0.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT: neg v6.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.s[2], w6
+; CHECK-GI-NOFP16-NEXT: mov v17.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w3
; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.s[2], v4.s[0]
-; CHECK-GI-NOFP16-NEXT: sshl v0.4s, v0.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-NOFP16-NEXT: mov v17.s[3], v6.s[0]
-; CHECK-GI-NOFP16-NEXT: eor v3.16b, v0.16b, v16.16b
-; CHECK-GI-NOFP16-NEXT: and v0.16b, v1.16b, v0.16b
-; CHECK-GI-NOFP16-NEXT: and v1.16b, v5.16b, v3.16b
-; CHECK-GI-NOFP16-NEXT: bsl v2.16b, v7.16b, v17.16b
-; CHECK-GI-NOFP16-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: mov s1, v2.s[1]
-; CHECK-GI-NOFP16-NEXT: mov s3, v2.s[2]
-; CHECK-GI-NOFP16-NEXT: mov s4, v2.s[3]
-; CHECK-GI-NOFP16-NEXT: fmov w0, s2
-; CHECK-GI-NOFP16-NEXT: mov s5, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov s6, v0.s[2]
-; CHECK-GI-NOFP16-NEXT: fmov w4, s0
-; CHECK-GI-NOFP16-NEXT: fmov w1, s1
+; CHECK-GI-NOFP16-NEXT: mov v3.s[0], w4
+; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: neg v4.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5
+; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #16]
+; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6
+; CHECK-GI-NOFP16-NEXT: mov v7.s[3], v4.s[0]
+; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v6.16b
+; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT: and v1.16b, v17.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v5.16b, v7.16b
+; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s3, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NOFP16-NEXT: fmov w0, s0
+; CHECK-GI-NOFP16-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s6, v1.s[2]
+; CHECK-GI-NOFP16-NEXT: fmov w4, s1
+; CHECK-GI-NOFP16-NEXT: fmov w1, s2
; CHECK-GI-NOFP16-NEXT: fmov w2, s3
; CHECK-GI-NOFP16-NEXT: fmov w3, s4
; CHECK-GI-NOFP16-NEXT: fmov w5, s5
@@ -1866,51 +1794,37 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
;
; CHECK-GI-FP16-LABEL: v7f16_i32:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
; CHECK-GI-FP16-NEXT: mov w9, #31 // =0x1f
; CHECK-GI-FP16-NEXT: mov v4.s[0], w0
+; CHECK-GI-FP16-NEXT: mov v2.s[0], w9
; CHECK-GI-FP16-NEXT: mov v5.s[0], w7
; CHECK-GI-FP16-NEXT: ldr s6, [sp]
; CHECK-GI-FP16-NEXT: mov v7.s[0], w4
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32]
; CHECK-GI-FP16-NEXT: ldr s17, [sp, #8]
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT: umov w10, v0.h[5]
; CHECK-GI-FP16-NEXT: mov v4.s[1], w1
+; CHECK-GI-FP16-NEXT: mov v2.s[1], w9
; CHECK-GI-FP16-NEXT: mov v5.s[1], v6.s[0]
; CHECK-GI-FP16-NEXT: ldr s6, [sp, #24]
; CHECK-GI-FP16-NEXT: mov v7.s[1], w5
; CHECK-GI-FP16-NEXT: mov v6.s[1], v16.s[0]
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT: mov v2.s[2], w9
; CHECK-GI-FP16-NEXT: mov v4.s[2], w2
; CHECK-GI-FP16-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-FP16-NEXT: mov v7.s[2], w6
+; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
; CHECK-GI-FP16-NEXT: mov v6.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v4.s[3], w3
-; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v3.8h, v2.8h
-; CHECK-GI-FP16-NEXT: mov v2.s[0], w9
-; CHECK-GI-FP16-NEXT: umov w8, v0.h[4]
-; CHECK-GI-FP16-NEXT: umov w10, v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v2.s[1], w9
-; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
-; CHECK-GI-FP16-NEXT: umov w8, v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v2.s[2], w9
-; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-FP16-NEXT: mov v1.s[1], w10
; CHECK-GI-FP16-NEXT: mov w10, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
; CHECK-GI-FP16-NEXT: mov v3.s[0], w10
+; CHECK-GI-FP16-NEXT: mov v4.s[3], w3
; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31
; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
; CHECK-GI-FP16-NEXT: mov v3.s[1], w10
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 7f07b088182cae..6eb2d958540bef 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -213,46 +213,25 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-LABEL: copysign_v7f16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
; CHECK-GI-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NEXT: mov w9, #32767 // =0x7fff
-; CHECK-GI-NEXT: fmov s5, w8
-; CHECK-GI-NEXT: fmov s4, w9
-; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v5.h[1], w8
-; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v4.h[1], w9
-; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v5.h[2], w8
-; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v4.h[2], w9
-; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v5.h[3], w8
-; CHECK-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v4.h[3], w9
-; CHECK-GI-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-NEXT: mov v5.h[4], w8
-; CHECK-GI-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v4.h[4], w9
-; CHECK-GI-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-NEXT: mov v5.h[5], w8
-; CHECK-GI-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v4.h[5], w9
-; CHECK-GI-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-GI-NEXT: mov v5.h[6], w8
-; CHECK-GI-NEXT: mov v4.h[6], w9
-; CHECK-GI-NEXT: and v1.16b, v3.16b, v5.16b
-; CHECK-GI-NEXT: and v0.16b, v2.16b, v4.16b
-; CHECK-GI-NEXT: orr v1.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: mov v2.h[1], w9
+; CHECK-GI-NEXT: mov v3.h[1], w8
+; CHECK-GI-NEXT: mov v2.h[2], w9
+; CHECK-GI-NEXT: mov v3.h[2], w8
+; CHECK-GI-NEXT: mov v2.h[3], w9
+; CHECK-GI-NEXT: mov v3.h[3], w8
+; CHECK-GI-NEXT: mov v2.h[4], w9
+; CHECK-GI-NEXT: mov v3.h[4], w8
+; CHECK-GI-NEXT: mov v2.h[5], w9
+; CHECK-GI-NEXT: mov v3.h[5], w8
+; CHECK-GI-NEXT: mov v2.h[6], w9
+; CHECK-GI-NEXT: mov v3.h[6], w8
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.copysign.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 55d9984c6392f5..15a8f0557cc417 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -175,52 +175,27 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: ceil_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: frintp v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintp v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: ceil_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: frintp v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: frintp v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.ceil.v7f16(<7 x half> %a)
@@ -511,52 +486,27 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: floor_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: frintm v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintm v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: floor_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: frintm v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: frintm v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.floor.v7f16(<7 x half> %a)
@@ -847,52 +797,27 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: frinti v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frinti v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: nearbyint_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: frinti v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: frinti v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.nearbyint.v7f16(<7 x half> %a)
@@ -1183,52 +1108,27 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: roundeven_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: frintn v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintn v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: roundeven_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: frintn v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: frintn v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.roundeven.v7f16(<7 x half> %a)
@@ -1519,52 +1419,27 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: rint_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: frintx v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintx v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: rint_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: frintx v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: frintx v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.rint.v7f16(<7 x half> %a)
@@ -1855,52 +1730,27 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: round_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: frinta v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frinta v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: round_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: frinta v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: frinta v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.round.v7f16(<7 x half> %a)
@@ -2191,52 +2041,27 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: trunc_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: frintz v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v3.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v3.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintz v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: trunc_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: frintz v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: frintz v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.trunc.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index 9acd0166fcaa85..82ce3af7e614f1 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -201,68 +201,32 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fdiv_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fdiv v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: fdiv v0.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fdiv_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: fdiv v1.8h, v2.8h, v3.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fdiv v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fdiv <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index 6072a2c56a06d1..08068ac4f10881 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -732,13 +732,7 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -2047,13 +2041,7 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -3362,13 +3350,7 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -4677,13 +4659,7 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -5992,13 +5968,7 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index c2e91a9956af91..e8201f62599b75 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -692,68 +692,32 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-NOFP16-GI-LABEL: min_v7f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT: fcvtl v0.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: fmin v2.4s, v2.4s, v3.4s
; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT: mov v1.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT: fmin v0.4s, v0.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v1.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NOFP16-GI-NEXT: mov v1.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmin v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: min_v7f16:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-FP16-GI-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-FP16-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-FP16-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-FP16-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-FP16-GI-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-FP16-GI-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-FP16-GI-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-FP16-GI-NEXT: fmin v1.8h, v2.8h, v3.8h
-; CHECK-FP16-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT: fmin v0.8h, v0.8h, v1.8h
; CHECK-FP16-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.minimum.v7f16(<7 x half> %a, <7 x half> %b)
@@ -826,68 +790,32 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-NOFP16-GI-LABEL: max_v7f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT: fcvtl v0.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: fmax v2.4s, v2.4s, v3.4s
; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT: mov v1.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT: fmax v0.4s, v0.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v1.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NOFP16-GI-NEXT: mov v1.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmax v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: max_v7f16:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-FP16-GI-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-FP16-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-FP16-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-FP16-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-FP16-GI-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-FP16-GI-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-FP16-GI-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-FP16-GI-NEXT: fmax v1.8h, v2.8h, v3.8h
-; CHECK-FP16-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT: fmax v0.8h, v0.8h, v1.8h
; CHECK-FP16-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.maximum.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index b7af6be8721d68..8a613907807c4f 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -692,68 +692,32 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-NOFP16-GI-LABEL: min_v7f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT: fcvtl v0.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: fminnm v2.4s, v2.4s, v3.4s
; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT: mov v1.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT: fminnm v0.4s, v0.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v1.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NOFP16-GI-NEXT: mov v1.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fminnm v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: min_v7f16:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-FP16-GI-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-FP16-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-FP16-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-FP16-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-FP16-GI-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-FP16-GI-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-FP16-GI-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-FP16-GI-NEXT: fminnm v1.8h, v2.8h, v3.8h
-; CHECK-FP16-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT: fminnm v0.8h, v0.8h, v1.8h
; CHECK-FP16-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.minnum.v7f16(<7 x half> %a, <7 x half> %b)
@@ -826,68 +790,32 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-NOFP16-GI-LABEL: max_v7f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT: fcvtl v0.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: fmaxnm v2.4s, v2.4s, v3.4s
; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-NOFP16-GI-NEXT: mov v1.h[1], v2.h[1]
-; CHECK-NOFP16-GI-NEXT: fmaxnm v0.4s, v0.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v1.h[2], v2.h[2]
-; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NOFP16-GI-NEXT: mov v1.h[3], v2.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmaxnm v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: max_v7f16:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-FP16-GI-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-FP16-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-FP16-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-FP16-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-FP16-GI-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-FP16-GI-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-FP16-GI-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-FP16-GI-NEXT: fmaxnm v1.8h, v2.8h, v3.8h
-; CHECK-FP16-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-FP16-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-FP16-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-FP16-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-FP16-GI-NEXT: fmaxnm v0.8h, v0.8h, v1.8h
; CHECK-FP16-GI-NEXT: ret
entry:
%c = call <7 x half> @llvm.maxnum.v7f16(<7 x half> %a, <7 x half> %b)
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 0a9d4c7b657e06..7ed9425ed42e90 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -272,84 +272,38 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
;
; CHECK-GI-NOFP16-LABEL: fma_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fmla v5.4s, v4.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v2.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v5.4h, v5.4s
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v5.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v5.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v6.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v5.h[1]
-; CHECK-GI-NOFP16-NEXT: fmla v3.4s, v2.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v5.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v5.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v5.h[1]
+; CHECK-GI-NOFP16-NEXT: fmla v3.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v5.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fma_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v3.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v4.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[0], v2.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v4.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v5.h[1], v2.h[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v4.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v5.h[2], v2.h[2]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v4.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v5.h[3], v2.h[3]
-; CHECK-GI-FP16-NEXT: mov v3.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v4.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v5.h[4], v2.h[4]
-; CHECK-GI-FP16-NEXT: mov v3.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v4.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v5.h[5], v2.h[5]
-; CHECK-GI-FP16-NEXT: mov v3.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v4.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: mov v5.h[6], v2.h[6]
-; CHECK-GI-FP16-NEXT: fmla v5.8h, v4.8h, v3.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v5.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v5.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v5.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v5.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v5.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v5.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v5.h[6]
+; CHECK-GI-FP16-NEXT: fmla v2.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v0.16b, v2.16b
; CHECK-GI-FP16-NEXT: ret
entry:
%d = call <7 x half> @llvm.fma.v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c)
@@ -934,90 +888,44 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
;
; CHECK-GI-NOFP16-LABEL: fmuladd_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4]
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5]
; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[6], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmuladd_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v3.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v4.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[0], v2.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v4.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v5.h[1], v2.h[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v4.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v5.h[2], v2.h[2]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v4.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v5.h[3], v2.h[3]
-; CHECK-GI-FP16-NEXT: mov v3.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v4.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v5.h[4], v2.h[4]
-; CHECK-GI-FP16-NEXT: mov v3.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v4.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v5.h[5], v2.h[5]
-; CHECK-GI-FP16-NEXT: mov v3.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v4.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: mov v5.h[6], v2.h[6]
-; CHECK-GI-FP16-NEXT: fmla v5.8h, v4.8h, v3.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v5.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v5.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v5.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v5.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v5.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v5.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v5.h[6]
+; CHECK-GI-FP16-NEXT: fmla v2.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov v0.16b, v2.16b
; CHECK-GI-FP16-NEXT: ret
entry:
%d = call <7 x half> @llvm.fmuladd.v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c)
@@ -1480,90 +1388,44 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
;
; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4]
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5]
; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[5], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[6], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmul_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v3.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v4.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[0], v2.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v4.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v5.h[1], v2.h[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v4.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v5.h[2], v2.h[2]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v4.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v5.h[3], v2.h[3]
-; CHECK-GI-FP16-NEXT: mov v3.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v4.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v5.h[4], v2.h[4]
-; CHECK-GI-FP16-NEXT: mov v3.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v4.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v5.h[5], v2.h[5]
-; CHECK-GI-FP16-NEXT: mov v3.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v4.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: mov v5.h[6], v2.h[6]
-; CHECK-GI-FP16-NEXT: fmla v5.8h, v3.8h, v4.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v5.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v5.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v5.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v5.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v5.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v5.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v5.h[6]
+; CHECK-GI-FP16-NEXT: fmla v2.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT: mov v0.16b, v2.16b
; CHECK-GI-FP16-NEXT: ret
entry:
%d = fmul fast <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index de6618ac18f157..f045c5ab96c4e6 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -201,68 +201,32 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v0.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmul_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-GI-FP16-NEXT: fmul v1.8h, v2.8h, v3.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fmul <7 x half> %a, %b
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index dd6266e8b3b1f4..bcd4bcf4c2b0b9 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -175,41 +175,13 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[0]
; CHECK-GI-NOFP16-NEXT: movi v1.8h, #128, lsl #8
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fabs_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: fneg v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fneg v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fneg <7 x half> %a
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index fb7efe82582322..08589d647d189c 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -885,13 +885,7 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #176
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index 3f122ee06d99a9..af81d5fa5bf6fd 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -815,13 +815,7 @@ define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) {
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index a7c51ea2b9ace1..8dae8328f3ceb6 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -397,35 +397,36 @@ define <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) {
; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT: fcvtzs v4.2d, v4.2d
+; CHECK-GI-NEXT: fcvtzs v3.2d, v4.2d
; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
; CHECK-GI-NEXT: adrp x8, .LCPI12_0
-; CHECK-GI-NEXT: cmgt v3.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT: cmgt v4.2d, v2.2d, v0.2d
; CHECK-GI-NEXT: cmgt v5.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT: bif v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: bif v0.16b, v2.16b, v4.16b
; CHECK-GI-NEXT: bif v1.16b, v2.16b, v5.16b
-; CHECK-GI-NEXT: cmgt v5.2d, v2.2d, v4.2d
-; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI12_0]
-; CHECK-GI-NEXT: bit v2.16b, v4.16b, v5.16b
-; CHECK-GI-NEXT: cmgt v6.2d, v0.2d, v3.2d
-; CHECK-GI-NEXT: cmgt v7.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT: bif v0.16b, v3.16b, v6.16b
-; CHECK-GI-NEXT: bif v1.16b, v3.16b, v7.16b
; CHECK-GI-NEXT: cmgt v4.2d, v2.2d, v3.2d
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: mov v1.16b, v4.16b
-; CHECK-GI-NEXT: bsl v1.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: mov s4, v0.s[3]
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: mov v1.s[0], v1.s[0]
-; CHECK-GI-NEXT: fmov w1, s2
-; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w3, s4
-; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT: bit v2.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: cmgt v3.2d, v0.2d, v5.2d
+; CHECK-GI-NEXT: cmgt v4.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: bif v0.16b, v5.16b, v3.16b
+; CHECK-GI-NEXT: bif v1.16b, v5.16b, v4.16b
+; CHECK-GI-NEXT: cmgt v3.2d, v2.2d, v5.2d
+; CHECK-GI-NEXT: bif v2.16b, v5.16b, v3.16b
+; CHECK-GI-NEXT: mov d3, v0.d[1]
+; CHECK-GI-NEXT: mov d4, v1.d[1]
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: fmov x2, d1
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT: fmov x4, d2
+; CHECK-GI-NEXT: fmov x1, d3
+; CHECK-GI-NEXT: fmov x3, d4
+; CHECK-GI-NEXT: // kill: def $w4 killed $w4 killed $x4
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT: // kill: def $w3 killed $w3 killed $x3
; CHECK-GI-NEXT: ret
%x = call <5 x i32> @llvm.fptosi.sat.v5f64.v5i32(<5 x double> %f)
ret <5 x i32> %x
@@ -444,49 +445,49 @@ define <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) {
;
; CHECK-GI-LABEL: test_signed_v6f64_v6i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
; CHECK-GI-NEXT: adrp x8, .LCPI13_1
-; CHECK-GI-NEXT: mov v4.d[1], v5.d[0]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT: mov v4.d[1], v5.d[0]
; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI13_1]
; CHECK-GI-NEXT: adrp x8, .LCPI13_0
-; CHECK-GI-NEXT: ldr q6, [x8, :lo12:.LCPI13_0]
-; CHECK-GI-NEXT: fcvtzs v1.2d, v4.2d
; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-GI-NEXT: cmgt v4.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT: cmgt v5.2d, v3.2d, v2.2d
-; CHECK-GI-NEXT: bif v1.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT: fcvtzs v2.2d, v4.2d
; CHECK-GI-NEXT: cmgt v4.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT: bif v2.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT: cmgt v5.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT: cmgt v6.2d, v3.2d, v2.2d
; CHECK-GI-NEXT: bif v0.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: cmgt v3.2d, v1.2d, v6.2d
-; CHECK-GI-NEXT: cmgt v4.2d, v2.2d, v6.2d
-; CHECK-GI-NEXT: bif v1.16b, v6.16b, v3.16b
-; CHECK-GI-NEXT: cmgt v3.2d, v0.2d, v6.2d
-; CHECK-GI-NEXT: bif v2.16b, v6.16b, v4.16b
-; CHECK-GI-NEXT: bif v0.16b, v6.16b, v3.16b
-; CHECK-GI-NEXT: mov d3, v1.d[1]
-; CHECK-GI-NEXT: mov v1.s[0], v1.s[0]
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: fmov x8, d3
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: mov s4, v0.s[3]
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: mov s5, v1.s[1]
-; CHECK-GI-NEXT: fmov w1, s2
-; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w3, s4
-; CHECK-GI-NEXT: fmov w4, s1
-; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: bif v1.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT: bif v2.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT: cmgt v4.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: cmgt v5.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: cmgt v6.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT: bif v0.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: bif v1.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT: bif v2.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT: mov d3, v0.d[1]
+; CHECK-GI-NEXT: mov d4, v1.d[1]
+; CHECK-GI-NEXT: mov d5, v2.d[1]
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: fmov x2, d1
+; CHECK-GI-NEXT: fmov x4, d2
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT: // kill: def $w4 killed $w4 killed $x4
+; CHECK-GI-NEXT: fmov x1, d3
+; CHECK-GI-NEXT: fmov x3, d4
+; CHECK-GI-NEXT: fmov x5, d5
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT: // kill: def $w3 killed $w3 killed $x3
+; CHECK-GI-NEXT: // kill: def $w5 killed $w5 killed $x5
; CHECK-GI-NEXT: ret
%x = call <6 x i32> @llvm.fptosi.sat.v6f64.v6i32(<6 x double> %f)
ret <6 x i32> %x
@@ -1285,22 +1286,18 @@ define <5 x i32> @test_signed_v5f16_v5i32(<5 x half> %f) {
;
; CHECK-GI-LABEL: test_signed_v5f16_v5i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NEXT: mov v0.h[0], v0.h[4]
-; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
-; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-GI-NEXT: mov s3, v1.s[2]
; CHECK-GI-NEXT: mov s4, v1.s[3]
; CHECK-GI-NEXT: fmov w0, s1
-; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w3, s4
; CHECK-GI-NEXT: ret
%x = call <5 x i32> @llvm.fptosi.sat.v5f16.v5i32(<5 x half> %f)
@@ -1324,26 +1321,22 @@ define <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
;
; CHECK-GI-LABEL: test_signed_v6f16_v6i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
-; CHECK-GI-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[5]
+; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: mov s4, v1.s[1]
-; CHECK-GI-NEXT: mov s5, v0.s[3]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w5, s4
-; CHECK-GI-NEXT: fmov w3, s5
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: fmov w5, s5
; CHECK-GI-NEXT: ret
%x = call <6 x i32> @llvm.fptosi.sat.v6f16.v6i32(<6 x half> %f)
ret <6 x i32> %x
@@ -1367,27 +1360,23 @@ define <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
;
; CHECK-GI-LABEL: test_signed_v7f16_v7i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
-; CHECK-GI-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[6]
+; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v0.s[2]
; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: mov s6, v1.s[2]
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w5, s5
; CHECK-GI-NEXT: fmov w6, s6
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index eb68125080f33a..a86c41a7b7edd7 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -378,25 +378,27 @@ define <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) {
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-GI-NEXT: fcvtzu v4.2d, v4.2d
+; CHECK-GI-NEXT: fcvtzu v3.2d, v4.2d
; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtzu v2.2d, v2.2d
-; CHECK-GI-NEXT: cmhi v3.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: cmhi v4.2d, v1.2d, v0.2d
; CHECK-GI-NEXT: cmhi v5.2d, v1.2d, v2.2d
-; CHECK-GI-NEXT: bif v0.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: bif v0.16b, v1.16b, v4.16b
; CHECK-GI-NEXT: bif v2.16b, v1.16b, v5.16b
-; CHECK-GI-NEXT: cmhi v3.2d, v1.2d, v4.2d
-; CHECK-GI-NEXT: bit v1.16b, v4.16b, v3.16b
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: mov v1.s[0], v1.s[0]
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: mov s4, v0.s[3]
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: fmov w4, s1
-; CHECK-GI-NEXT: fmov w1, s2
-; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: cmhi v4.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: bit v1.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: mov d3, v0.d[1]
+; CHECK-GI-NEXT: mov d4, v2.d[1]
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: fmov x2, d2
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT: fmov x4, d1
+; CHECK-GI-NEXT: fmov x1, d3
+; CHECK-GI-NEXT: fmov x3, d4
+; CHECK-GI-NEXT: // kill: def $w4 killed $w4 killed $x4
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT: // kill: def $w3 killed $w3 killed $x3
; CHECK-GI-NEXT: ret
%x = call <5 x i32> @llvm.fptoui.sat.v5f64.v5i32(<5 x double> %f)
ret <5 x i32> %x
@@ -415,40 +417,40 @@ define <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v6f64_v6i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT: mov v4.d[1], v5.d[0]
+; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT: movi v3.2d, #0x000000ffffffff
-; CHECK-GI-NEXT: fcvtzu v1.2d, v4.2d
+; CHECK-GI-NEXT: mov v4.d[1], v5.d[0]
+; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtzu v2.2d, v2.2d
-; CHECK-GI-NEXT: cmhi v4.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT: cmhi v5.2d, v3.2d, v2.2d
-; CHECK-GI-NEXT: bif v1.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: cmhi v4.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT: bif v2.16b, v3.16b, v5.16b
-; CHECK-GI-NEXT: bif v0.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: mov d3, v1.d[1]
-; CHECK-GI-NEXT: mov v1.s[0], v1.s[0]
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: fmov x8, d3
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: mov s4, v0.s[3]
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: mov s5, v1.s[1]
-; CHECK-GI-NEXT: fmov w1, s2
-; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w3, s4
-; CHECK-GI-NEXT: fmov w4, s1
-; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fcvtzu v3.2d, v4.2d
+; CHECK-GI-NEXT: cmhi v4.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: cmhi v5.2d, v1.2d, v2.2d
+; CHECK-GI-NEXT: cmhi v6.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: bif v0.16b, v1.16b, v4.16b
+; CHECK-GI-NEXT: bif v2.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: bit v1.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT: mov d3, v0.d[1]
+; CHECK-GI-NEXT: mov d4, v2.d[1]
+; CHECK-GI-NEXT: mov d5, v1.d[1]
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: fmov x2, d2
+; CHECK-GI-NEXT: fmov x4, d1
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT: // kill: def $w4 killed $w4 killed $x4
+; CHECK-GI-NEXT: fmov x1, d3
+; CHECK-GI-NEXT: fmov x3, d4
+; CHECK-GI-NEXT: fmov x5, d5
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT: // kill: def $w3 killed $w3 killed $x3
+; CHECK-GI-NEXT: // kill: def $w5 killed $w5 killed $x5
; CHECK-GI-NEXT: ret
%x = call <6 x i32> @llvm.fptoui.sat.v6f64.v6i32(<6 x double> %f)
ret <6 x i32> %x
@@ -1115,22 +1117,18 @@ define <5 x i32> @test_unsigned_v5f16_v5i32(<5 x half> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v5f16_v5i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NEXT: mov v0.h[0], v0.h[4]
-; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
-; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
; CHECK-GI-NEXT: mov s3, v1.s[2]
; CHECK-GI-NEXT: mov s4, v1.s[3]
; CHECK-GI-NEXT: fmov w0, s1
-; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w3, s4
; CHECK-GI-NEXT: ret
%x = call <5 x i32> @llvm.fptoui.sat.v5f16.v5i32(<5 x half> %f)
@@ -1154,26 +1152,22 @@ define <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v6f16_v6i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
-; CHECK-GI-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[5]
+; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: mov s4, v1.s[1]
-; CHECK-GI-NEXT: mov s5, v0.s[3]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: fmov w5, s4
-; CHECK-GI-NEXT: fmov w3, s5
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: fmov w5, s5
; CHECK-GI-NEXT: ret
%x = call <6 x i32> @llvm.fptoui.sat.v6f16.v6i32(<6 x half> %f)
ret <6 x i32> %x
@@ -1197,27 +1191,23 @@ define <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v7f16_v7i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h
-; CHECK-GI-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[4]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[5]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[6]
+; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v0.s[2]
; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: mov s6, v1.s[2]
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w4, s1
; CHECK-GI-NEXT: fmov w5, s5
; CHECK-GI-NEXT: fmov w6, s6
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index ad8576c63b1aea..cc2443497ad83b 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -886,13 +886,7 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #176
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index eac17ec72bc990..4136dfe010eadd 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -731,13 +731,7 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
@@ -2045,13 +2039,7 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: add sp, sp, #160
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 15e93e244f1d5c..7514c9235b0397 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -207,52 +207,27 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: sqrt_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: fsqrt v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v1.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[4], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[5], v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[6], v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[4]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: fsqrt v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: sqrt_v7f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-FP16-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-FP16-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-FP16-NEXT: fsqrt v1.8h, v1.8h
-; CHECK-GI-FP16-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-FP16-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-FP16-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-FP16-NEXT: fsqrt v0.8h, v0.8h
; CHECK-GI-FP16-NEXT: ret
entry:
%c = call <7 x half> @llvm.sqrt.v7f16(<7 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 517cf7c4352fd3..a93a089cda3be2 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -238,38 +238,20 @@ define <7 x i8> @load_v7i8(ptr %ptr){
;
; CHECK-GI-LABEL: load_v7i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w9, [x0, #1]
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: mov v0.h[1], w9
-; CHECK-GI-NEXT: mov v0.h[2], w8
-; CHECK-GI-NEXT: ldrb w8, [x0, #3]
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: ldrb w8, [x0, #4]
-; CHECK-GI-NEXT: mov v0.h[4], w8
-; CHECK-GI-NEXT: ldrb w8, [x0, #5]
-; CHECK-GI-NEXT: mov v0.h[5], w8
-; CHECK-GI-NEXT: ldrb w8, [x0, #6]
-; CHECK-GI-NEXT: mov v0.h[6], w8
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[3]
-; CHECK-GI-NEXT: mov h3, v0.h[4]
-; CHECK-GI-NEXT: mov h4, v0.h[5]
-; CHECK-GI-NEXT: mov h5, v0.h[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov h1, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.b[4], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.b[6], w8
+; CHECK-GI-NEXT: ldr b0, [x0]
+; CHECK-GI-NEXT: ldr b1, [x0, #1]
+; CHECK-GI-NEXT: mov v0.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #3]
+; CHECK-GI-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #4]
+; CHECK-GI-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #5]
+; CHECK-GI-NEXT: mov v0.b[5], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #6]
+; CHECK-GI-NEXT: mov v0.b[6], v1.b[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%a = load <7 x i8>, ptr %ptr
@@ -307,26 +289,19 @@ define <7 x i16> @load_v7i16(ptr %ptr){
;
; CHECK-GI-LABEL: load_v7i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: ldr h0, [x0, #2]
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: add x8, x0, #4
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
-; CHECK-GI-NEXT: ld1 { v1.h }[2], [x8]
+; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-GI-NEXT: add x8, x0, #6
-; CHECK-GI-NEXT: ld1 { v1.h }[3], [x8]
+; CHECK-GI-NEXT: ld1 { v0.h }[3], [x8]
; CHECK-GI-NEXT: add x8, x0, #8
-; CHECK-GI-NEXT: ld1 { v1.h }[4], [x8]
+; CHECK-GI-NEXT: ld1 { v0.h }[4], [x8]
; CHECK-GI-NEXT: add x8, x0, #10
-; CHECK-GI-NEXT: ld1 { v1.h }[5], [x8]
+; CHECK-GI-NEXT: ld1 { v0.h }[5], [x8]
; CHECK-GI-NEXT: add x8, x0, #12
-; CHECK-GI-NEXT: ld1 { v1.h }[6], [x8]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
+; CHECK-GI-NEXT: ld1 { v0.h }[6], [x8]
; CHECK-GI-NEXT: ret
%a = load <7 x i16>, ptr %ptr
ret <7 x i16> %a
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index c8344a39da56a7..a9517383cae0db 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -1086,80 +1086,10 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){
}
define <7 x i8> @shl_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-SD-LABEL: shl_v7i8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ushl v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: shl_v7i8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov b3, v0.b[1]
-; CHECK-GI-NEXT: mov b4, v1.b[1]
-; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
-; CHECK-GI-NEXT: mov v5.b[0], v1.b[0]
-; CHECK-GI-NEXT: mov b6, v0.b[2]
-; CHECK-GI-NEXT: mov b7, v1.b[2]
-; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v1.b[3]
-; CHECK-GI-NEXT: mov v2.b[2], v6.b[0]
-; CHECK-GI-NEXT: mov b6, v0.b[4]
-; CHECK-GI-NEXT: mov v5.b[2], v7.b[0]
-; CHECK-GI-NEXT: mov b7, v1.b[4]
-; CHECK-GI-NEXT: mov v2.b[3], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[5]
-; CHECK-GI-NEXT: mov b0, v0.b[6]
-; CHECK-GI-NEXT: mov v5.b[3], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v1.b[5]
-; CHECK-GI-NEXT: mov b1, v1.b[6]
-; CHECK-GI-NEXT: mov v2.b[4], v6.b[0]
-; CHECK-GI-NEXT: mov v5.b[4], v7.b[0]
-; CHECK-GI-NEXT: mov v2.b[5], v3.b[0]
-; CHECK-GI-NEXT: mov v5.b[5], v4.b[0]
-; CHECK-GI-NEXT: mov v2.b[6], v0.b[0]
-; CHECK-GI-NEXT: mov v5.b[6], v1.b[0]
-; CHECK-GI-NEXT: ushl v0.8b, v2.8b, v5.8b
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov b1, v0.b[5]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.h[2], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.h[4], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.h[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.h[6], w8
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[3]
-; CHECK-GI-NEXT: mov h3, v0.h[4]
-; CHECK-GI-NEXT: mov h4, v0.h[5]
-; CHECK-GI-NEXT: mov h5, v0.h[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov h1, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.b[4], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.b[6], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: shl_v7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%3 = shl <7 x i8> %0, %1
ret <7 x i8> %3
}
@@ -1198,36 +1128,10 @@ define <3 x i16> @shl_v3i16(<3 x i16> %0, <3 x i16> %1){
}
define <7 x i16> @shl_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-SD-LABEL: shl_v7i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v1.8h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: shl_v7i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-GI-NEXT: ushl v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: shl_v7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%3 = shl <7 x i16> %0, %1
ret <7 x i16> %3
}
@@ -1301,82 +1205,11 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){
}
define <7 x i8> @ashr_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-SD-LABEL: ashr_v7i8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: neg v1.8b, v1.8b
-; CHECK-SD-NEXT: sshl v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ashr_v7i8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov b2, v1.b[1]
-; CHECK-GI-NEXT: mov v3.b[0], v1.b[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b4, v0.b[1]
-; CHECK-GI-NEXT: mov v5.b[0], v0.b[0]
-; CHECK-GI-NEXT: mov b6, v1.b[2]
-; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v1.b[3]
-; CHECK-GI-NEXT: mov v3.b[2], v6.b[0]
-; CHECK-GI-NEXT: mov b6, v0.b[3]
-; CHECK-GI-NEXT: mov v5.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[4]
-; CHECK-GI-NEXT: mov v3.b[3], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov v5.b[3], v6.b[0]
-; CHECK-GI-NEXT: mov b6, v1.b[5]
-; CHECK-GI-NEXT: mov b1, v1.b[6]
-; CHECK-GI-NEXT: mov v3.b[4], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v0.b[5]
-; CHECK-GI-NEXT: mov b0, v0.b[6]
-; CHECK-GI-NEXT: mov v5.b[4], v4.b[0]
-; CHECK-GI-NEXT: mov v3.b[5], v6.b[0]
-; CHECK-GI-NEXT: mov v5.b[5], v2.b[0]
-; CHECK-GI-NEXT: mov v3.b[6], v1.b[0]
-; CHECK-GI-NEXT: mov v5.b[6], v0.b[0]
-; CHECK-GI-NEXT: neg v0.8b, v3.8b
-; CHECK-GI-NEXT: sshl v0.8b, v5.8b, v0.8b
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov b1, v0.b[5]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.h[2], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.h[4], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.h[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.h[6], w8
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[3]
-; CHECK-GI-NEXT: mov h3, v0.h[4]
-; CHECK-GI-NEXT: mov h4, v0.h[5]
-; CHECK-GI-NEXT: mov h5, v0.h[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov h1, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.b[4], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.b[6], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ashr_v7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.8b, v1.8b
+; CHECK-NEXT: sshl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%3 = ashr <7 x i8> %0, %1
ret <7 x i8> %3
}
@@ -1417,38 +1250,11 @@ define <3 x i16> @ashr_v3i16(<3 x i16> %0, <3 x i16> %1){
}
define <7 x i16> @ashr_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-SD-LABEL: ashr_v7i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: neg v1.8h, v1.8h
-; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ashr_v7i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v2.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v3.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v3.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v2.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v3.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v2.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v3.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v2.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v3.h[4], v0.h[4]
-; CHECK-GI-NEXT: mov v2.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v3.h[5], v0.h[5]
-; CHECK-GI-NEXT: mov v2.h[6], v1.h[6]
-; CHECK-GI-NEXT: mov v3.h[6], v0.h[6]
-; CHECK-GI-NEXT: neg v0.8h, v2.8h
-; CHECK-GI-NEXT: sshl v1.8h, v3.8h, v0.8h
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ashr_v7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.8h, v1.8h
+; CHECK-NEXT: sshl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%3 = ashr <7 x i16> %0, %1
ret <7 x i16> %3
}
@@ -1523,82 +1329,11 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){
}
define <7 x i8> @lshr_v7i8(<7 x i8> %0, <7 x i8> %1){
-; CHECK-SD-LABEL: lshr_v7i8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: neg v1.8b, v1.8b
-; CHECK-SD-NEXT: ushl v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: lshr_v7i8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov b2, v1.b[1]
-; CHECK-GI-NEXT: mov v3.b[0], v1.b[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b4, v0.b[1]
-; CHECK-GI-NEXT: mov v5.b[0], v0.b[0]
-; CHECK-GI-NEXT: mov b6, v1.b[2]
-; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v1.b[3]
-; CHECK-GI-NEXT: mov v3.b[2], v6.b[0]
-; CHECK-GI-NEXT: mov b6, v0.b[3]
-; CHECK-GI-NEXT: mov v5.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[4]
-; CHECK-GI-NEXT: mov v3.b[3], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov v5.b[3], v6.b[0]
-; CHECK-GI-NEXT: mov b6, v1.b[5]
-; CHECK-GI-NEXT: mov b1, v1.b[6]
-; CHECK-GI-NEXT: mov v3.b[4], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v0.b[5]
-; CHECK-GI-NEXT: mov b0, v0.b[6]
-; CHECK-GI-NEXT: mov v5.b[4], v4.b[0]
-; CHECK-GI-NEXT: mov v3.b[5], v6.b[0]
-; CHECK-GI-NEXT: mov v5.b[5], v2.b[0]
-; CHECK-GI-NEXT: mov v3.b[6], v1.b[0]
-; CHECK-GI-NEXT: mov v5.b[6], v0.b[0]
-; CHECK-GI-NEXT: neg v0.8b, v3.8b
-; CHECK-GI-NEXT: ushl v0.8b, v5.8b, v0.8b
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov b1, v0.b[5]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.h[2], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.h[4], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.h[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.h[6], w8
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[3]
-; CHECK-GI-NEXT: mov h3, v0.h[4]
-; CHECK-GI-NEXT: mov h4, v0.h[5]
-; CHECK-GI-NEXT: mov h5, v0.h[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov h1, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.b[4], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.b[6], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: lshr_v7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.8b, v1.8b
+; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%3 = lshr <7 x i8> %0, %1
ret <7 x i8> %3
}
@@ -1639,38 +1374,11 @@ define <3 x i16> @lshr_v3i16(<3 x i16> %0, <3 x i16> %1){
}
define <7 x i16> @lshr_v7i16(<7 x i16> %0, <7 x i16> %1){
-; CHECK-SD-LABEL: lshr_v7i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: neg v1.8h, v1.8h
-; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v1.8h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: lshr_v7i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v2.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v3.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v3.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v2.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v3.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v2.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v3.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v2.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v3.h[4], v0.h[4]
-; CHECK-GI-NEXT: mov v2.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v3.h[5], v0.h[5]
-; CHECK-GI-NEXT: mov v2.h[6], v1.h[6]
-; CHECK-GI-NEXT: mov v3.h[6], v0.h[6]
-; CHECK-GI-NEXT: neg v0.8h, v2.8h
-; CHECK-GI-NEXT: ushl v1.8h, v3.8h, v0.8h
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[6]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: lshr_v7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg v1.8h, v1.8h
+; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%3 = lshr <7 x i16> %0, %1
ret <7 x i16> %3
}
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index db0fd4293e084b..02142f9b9e71dd 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -544,62 +544,12 @@ define <7 x i8> @shufflevector_v7i8(<7 x i8> %a, <7 x i8> %b) {
; CHECK-GI-LABEL: shufflevector_v7i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b2, v0.b[1]
-; CHECK-GI-NEXT: mov b3, v0.b[2]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov b4, v0.b[3]
-; CHECK-GI-NEXT: mov b5, v0.b[4]
-; CHECK-GI-NEXT: mov b6, v0.b[5]
-; CHECK-GI-NEXT: mov b7, v1.b[3]
-; CHECK-GI-NEXT: mov b16, v1.b[4]
-; CHECK-GI-NEXT: mov b17, v1.b[5]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov b2, v0.b[6]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov b3, v1.b[1]
-; CHECK-GI-NEXT: mov v0.h[2], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov b4, v1.b[2]
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov b3, v1.b[6]
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v1.h[1], w9
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mov v0.h[4], w8
-; CHECK-GI-NEXT: fmov w8, s6
-; CHECK-GI-NEXT: mov v1.h[2], w9
-; CHECK-GI-NEXT: fmov w9, s7
-; CHECK-GI-NEXT: mov v0.h[5], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v1.h[3], w9
-; CHECK-GI-NEXT: mov v0.h[6], w8
-; CHECK-GI-NEXT: fmov w8, s16
-; CHECK-GI-NEXT: mov v1.h[4], w8
-; CHECK-GI-NEXT: fmov w8, s17
-; CHECK-GI-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NEXT: mov h0, v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[5], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mov v2.b[1], w9
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov v1.h[6], w8
-; CHECK-GI-NEXT: mov v2.b[2], w9
-; CHECK-GI-NEXT: mov h0, v1.h[1]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov h3, v1.h[3]
-; CHECK-GI-NEXT: mov v2.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov h0, v1.h[5]
-; CHECK-GI-NEXT: mov v2.b[4], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v2.b[5], w8
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov v2.b[6], w8
-; CHECK-GI-NEXT: fmov d0, d2
+; CHECK-GI-NEXT: adrp x8, .LCPI31_0
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI31_0]
+; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
ret <7 x i8> %c
@@ -645,27 +595,11 @@ define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
;
; CHECK-GI-LABEL: shufflevector_v7i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NEXT: mov v2.h[4], v0.h[4]
-; CHECK-GI-NEXT: mov v3.h[4], v1.h[4]
-; CHECK-GI-NEXT: mov v2.h[5], v0.h[5]
-; CHECK-GI-NEXT: mov v3.h[5], v1.h[5]
-; CHECK-GI-NEXT: mov v2.h[6], v0.h[6]
-; CHECK-GI-NEXT: mov v3.h[6], v1.h[6]
-; CHECK-GI-NEXT: mov v0.h[0], v2.h[1]
-; CHECK-GI-NEXT: mov v0.h[1], v2.h[3]
-; CHECK-GI-NEXT: mov v0.h[2], v2.h[5]
-; CHECK-GI-NEXT: mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT: mov v0.h[4], v3.h[1]
-; CHECK-GI-NEXT: mov v0.h[5], v3.h[3]
-; CHECK-GI-NEXT: mov v0.h[6], v3.h[5]
+; CHECK-GI-NEXT: adrp x8, .LCPI33_0
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
ret <7 x i16> %c
@@ -714,47 +648,11 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
}
define <7 x i8> @shufflevector_v7i8_zeroes(<7 x i8> %a, <7 x i8> %b) {
-; CHECK-SD-LABEL: shufflevector_v7i8_zeroes:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: dup v0.8b, v0.b[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: shufflevector_v7i8_zeroes:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[6]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov b1, v0.b[5]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.h[2], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v0.h[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.h[4], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.h[5], w8
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: mov v0.h[6], w8
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: fmov w10, s0
-; CHECK-GI-NEXT: fmov w11, s0
-; CHECK-GI-NEXT: fmov w12, s0
-; CHECK-GI-NEXT: fmov w13, s0
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: mov v0.b[2], w9
-; CHECK-GI-NEXT: mov v0.b[3], w10
-; CHECK-GI-NEXT: mov v0.b[4], w11
-; CHECK-GI-NEXT: mov v0.b[5], w12
-; CHECK-GI-NEXT: mov v0.b[6], w13
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: shufflevector_v7i8_zeroes:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: dup v0.8b, v0.b[0]
+; CHECK-NEXT: ret
%c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <7 x i8> %c
}
@@ -785,28 +683,10 @@ define <3 x i16> @shufflevector_v3i16_zeroes(<3 x i16> %a, <3 x i16> %b) {
}
define <7 x i16> @shufflevector_v7i16_zeroes(<7 x i16> %a, <7 x i16> %b) {
-; CHECK-SD-LABEL: shufflevector_v7i16_zeroes:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: dup v0.8h, v0.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: shufflevector_v7i16_zeroes:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v1.h[4], v0.h[4]
-; CHECK-GI-NEXT: mov v1.h[5], v0.h[5]
-; CHECK-GI-NEXT: mov v1.h[6], v0.h[6]
-; CHECK-GI-NEXT: mov v0.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: shufflevector_v7i16_zeroes:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
%c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <7 x i16> %c
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index 4d400d53916f16..f426fb8954ed26 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -2115,7 +2115,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2172,7 +2172,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2237,7 +2237,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %43, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2294,7 +2294,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2357,7 +2357,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %43, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2414,7 +2414,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2476,7 +2476,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2534,7 +2534,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2596,7 +2596,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2654,7 +2654,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2716,7 +2716,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2774,7 +2774,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2835,7 +2835,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: bb.2:
- ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %42, %bb.3
+ ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
@@ -2891,7 +2891,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %41, %bb.3
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec