[llvm] 7b6f760 - [ARM] MVE vector lane interleaving
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 28 11:35:26 PDT 2021
Author: David Green
Date: 2021-03-28T19:34:58+01:00
New Revision: 7b6f760fcd19c52149a5dea81512bdceb222032a
URL: https://github.com/llvm/llvm-project/commit/7b6f760fcd19c52149a5dea81512bdceb222032a
DIFF: https://github.com/llvm/llvm-project/commit/7b6f760fcd19c52149a5dea81512bdceb222032a.diff
LOG: [ARM] MVE vector lane interleaving
MVE does not have a single sext/zext or trunc instruction that takes the
bottom half of a vector and extends it to full width, as NEON does with
MOVL. Instead this is expected to happen through top/bottom
instructions. So the MVE equivalents, the VMOVLT/B instructions, take
either the even or odd elements of the input and extend them to the
larger type, producing a vector with half the number of elements, each
of double the bitwidth. As there is no simple instruction for a normal
extend, we often have to expand sext/zext/trunc into a series of lane
moves (or stack loads/stores, which we do not do yet).
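As a rough sketch of the top/bottom semantics (hand-written for
illustration, not compiler output), given a v8i16 input q0 = [a0 .. a7]:
  vmovlb.s16 q1, q0   @ q1 = v4i32 [sext(a0), sext(a2), sext(a4), sext(a6)]
  vmovlt.s16 q2, q0   @ q2 = v4i32 [sext(a1), sext(a3), sext(a5), sext(a7)]
  vmovnt.i32 q1, q2   @ trunc(q2[k]) written into the odd i16 lanes of q1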
This pass takes vector code that starts at truncs, looks for
interconnected blobs of operations that end with sext/zext and
transforms them by adding shuffles so that the lanes are interleaved and
the MVE VMOVL/VMOVN instructions can be used. This is done pre-ISel so
that it can work across basic blocks.
This initial version of the pass handles only a limited set of
instructions and does not yet handle constants, splats or FP, which can
all come as extensions to this base.
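As a worked example of the shuffles this produces, the masks for the
16 x i8 case, derived by hand from the mask-building loops in
MVELaneInterleavingPass.cpp below (the v8i16 masks appear in the file
header comment), come out as:
  LeafMask : <0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>
  TruncMask: <0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>
The leaf shuffle groups the even and odd lanes into contiguous halves
for VMOVLB/VMOVLT, and the trunc shuffle interleaves them back so the
backend can match the VMOVN top/bottom narrowing instructions.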
Differential Revision: https://reviews.llvm.org/D95804
Added:
llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
Modified:
llvm/lib/Target/ARM/ARM.h
llvm/lib/Target/ARM/ARMTargetMachine.cpp
llvm/lib/Target/ARM/CMakeLists.txt
llvm/test/CodeGen/ARM/O3-pipeline.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
index 21ce3b19e170..5500783f74db 100644
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -58,6 +58,7 @@ createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget
Pass *createMVEGatherScatterLoweringPass();
FunctionPass *createARMSLSHardeningPass();
FunctionPass *createARMIndirectThunks();
+Pass *createMVELaneInterleavingPass();
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
@@ -76,6 +77,7 @@ void initializeARMBlockPlacementPass(PassRegistry &);
void initializeMVETailPredicationPass(PassRegistry &);
void initializeMVEGatherScatterLoweringPass(PassRegistry &);
void initializeARMSLSHardeningPass(PassRegistry &);
+void initializeMVELaneInterleavingPass(PassRegistry &);
} // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 23a330f60414..c09df077e257 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -102,6 +102,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
initializeARMBlockPlacementPass(Registry);
initializeMVEGatherScatterLoweringPass(Registry);
initializeARMSLSHardeningPass(Registry);
+ initializeMVELaneInterleavingPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -416,6 +417,7 @@ void ARMPassConfig::addIRPasses() {
}));
addPass(createMVEGatherScatterLoweringPass());
+ addPass(createMVELaneInterleavingPass());
TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index 2b03e9fb3f59..89abc579460b 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -56,6 +56,7 @@ add_llvm_target(ARMCodeGen
ARMTargetTransformInfo.cpp
MLxExpansionPass.cpp
MVEGatherScatterLowering.cpp
+ MVELaneInterleavingPass.cpp
MVETailPredication.cpp
MVEVPTBlockPass.cpp
MVETPAndVPTOptimisationsPass.cpp
diff --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
new file mode 100644
index 000000000000..c77130b7b2c3
--- /dev/null
+++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
@@ -0,0 +1,328 @@
+//===- MVELaneInterleaving.cpp - Interleaving for MVE instructions -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass interleaves around sext/zext/trunc instructions. MVE does not have
+// a single sext/zext or trunc instruction that takes the bottom half of a
+// vector and extends it to full width, as NEON does with MOVL. Instead it is
+// expected that this happens through top/bottom instructions. So the MVE
+// equivalent VMOVLT/B instructions take either the even or odd elements of the
+// input and extend them to the larger type, producing a vector with half the
+// number of elements each of double the bitwidth. As there is no simple
+// instruction, we often have to turn sext/zext/trunc into a series of lane
+// moves (or stack loads/stores, which we do not do yet).
+//
+// This pass takes vector code that starts at truncs, looks for interconnected
+// blobs of operations that end with sext/zext (or constants/splats) of the
+// form:
+// %sa = sext v8i16 %a to v8i32
+// %sb = sext v8i16 %b to v8i32
+// %add = add v8i32 %sa, %sb
+// %r = trunc %add to v8i16
+// And adds shuffles to allow the use of VMOVL/VMOVN instructions:
+// %sha = shuffle v8i16 %a, undef, <0, 2, 4, 6, 1, 3, 5, 7>
+// %sa = sext v8i16 %sha to v8i32
+// %shb = shuffle v8i16 %b, undef, <0, 2, 4, 6, 1, 3, 5, 7>
+// %sb = sext v8i16 %shb to v8i32
+// %add = add v8i32 %sa, %sb
+// %r = trunc %add to v8i16
+// %shr = shuffle v8i16 %r, undef, <0, 4, 1, 5, 2, 6, 3, 7>
+// Which can then be split and lowered to MVE instructions efficiently:
+// %sa_b = VMOVLB.s16 %a
+// %sa_t = VMOVLT.s16 %a
+// %sb_b = VMOVLB.s16 %b
+// %sb_t = VMOVLT.s16 %b
+// %add_b = VADD.i32 %sa_b, %sb_b
+// %add_t = VADD.i32 %sa_t, %sb_t
+// %r = VMOVNT.i16 %add_b, %add_t
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mve-laneinterleave"
+
+cl::opt<bool> EnableInterleave(
+ "enable-mve-interleave", cl::Hidden, cl::init(true),
+ cl::desc("Enable interleave MVE vector operation lowering"));
+
+namespace {
+
+class MVELaneInterleaving : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ explicit MVELaneInterleaving() : FunctionPass(ID) {
+ initializeMVELaneInterleavingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "MVE lane interleaving"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char MVELaneInterleaving::ID = 0;
+
+INITIALIZE_PASS(MVELaneInterleaving, DEBUG_TYPE, "MVE lane interleaving", false,
+ false)
+
+Pass *llvm::createMVELaneInterleavingPass() {
+ return new MVELaneInterleaving();
+}
+
+static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
+ SmallSetVector<Instruction *, 4> &Truncs) {
+ // This is not always beneficial to transform. Exts can be incorporated into
+ // loads, Truncs can be folded into stores.
+ // Truncs are usually the same number of instructions,
+ // VSTRH.32(A);VSTRH.32(B) vs VSTRH.16(VMOVNT A, B) with interleaving
+ // Exts are unfortunately more instructions in the general case:
+ // A=VLDRH.32; B=VLDRH.32;
+ // vs with interleaving:
+// T=VLDRH.16; A=VMOVLB T; B=VMOVLT T
+ // But those VMOVL may be folded into a VMULL.
+
+ // But expensive extends/truncs are always good to remove.
+ for (auto *E : Exts)
+ if (!isa<LoadInst>(E->getOperand(0))) {
+ LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
+ return true;
+ }
+ for (auto *T : Truncs)
+ if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
+ LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
+ return true;
+ }
+
+ // Otherwise, we know we have a load(ext); check that each of the extends
+ // feeds a mul, which can become a VMULL. This is a simple heuristic and
+ // certainly not perfect.
+ for (auto *E : Exts) {
+ if (!E->hasOneUse() ||
+ cast<Instruction>(*E->user_begin())->getOpcode() != Instruction::Mul) {
+ LLVM_DEBUG(dbgs() << "Not beneficial due to " << *E << "\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool tryInterleave(Instruction *Start,
+ SmallPtrSetImpl<Instruction *> &Visited) {
+ LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");
+ auto *VT = cast<FixedVectorType>(Start->getType());
+
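+ // Non-instruction leaves (constants, splats) are not handled yet.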
+ if (!isa<Instruction>(Start->getOperand(0)))
+ return false;
+
+ // Look for connected operations starting from Ext's, terminating at Truncs.
+ std::vector<Instruction *> Worklist;
+ Worklist.push_back(Start);
+ Worklist.push_back(cast<Instruction>(Start->getOperand(0)));
+
+ SmallSetVector<Instruction *, 4> Truncs;
+ SmallSetVector<Instruction *, 4> Exts;
+ SmallSetVector<Instruction *, 4> Ops;
+
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ switch (I->getOpcode()) {
+ // Truncs
+ case Instruction::Trunc:
+ if (Truncs.count(I))
+ continue;
+ Truncs.insert(I);
+ Visited.insert(I);
+ break;
+
+ // Extend leaves
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ if (Exts.count(I))
+ continue;
+ for (auto *Use : I->users())
+ Worklist.push_back(cast<Instruction>(Use));
+ Exts.insert(I);
+ break;
+
+ // Binary/ternary ops
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::ICmp:
+ case Instruction::Select:
+ if (Ops.count(I))
+ continue;
+ Ops.insert(I);
+
+ for (Use &Op : I->operands()) {
+ if (isa<Instruction>(Op))
+ Worklist.push_back(cast<Instruction>(&Op));
+ else
+ return false;
+ }
+
+ for (auto *Use : I->users())
+ Worklist.push_back(cast<Instruction>(Use));
+ break;
+
+ default:
+ LLVM_DEBUG(dbgs() << " Unhandled instruction: " << *I << "\n");
+ return false;
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Found group:\n Exts:";
+ for (auto *I : Exts)
+ dbgs() << " " << *I << "\n";
+ dbgs() << " Ops:";
+ for (auto *I : Ops)
+ dbgs() << " " << *I << "\n";
+ dbgs() << "Truncs:";
+ for (auto *I : Truncs)
+ dbgs() << " " << *I << "\n";
+ });
+
+ assert(!Truncs.empty() && "Expected some truncs");
+ assert(!Exts.empty() && "Expected some leaves");
+
+ // Check types
+ unsigned NumElts = VT->getNumElements();
+ unsigned BaseElts = VT->getScalarSizeInBits() == 16
+ ? 8
+ : (VT->getScalarSizeInBits() == 8 ? 16 : 0);
+ if (BaseElts == 0 || NumElts % BaseElts != 0) {
+ LLVM_DEBUG(dbgs() << " Type is unsupported\n");
+ return false;
+ }
+ if (Start->getOperand(0)->getType()->getScalarSizeInBits() !=
+ VT->getScalarSizeInBits() * 2) {
+ LLVM_DEBUG(dbgs() << " Type not double sized\n");
+ return false;
+ }
+ for (Instruction *I : Exts)
+ if (I->getOperand(0)->getType() != VT) {
+ LLVM_DEBUG(dbgs() << " Wrong type on " << *I << "\n");
+ return false;
+ }
+ for (Instruction *I : Truncs)
+ if (I->getType() != VT) {
+ LLVM_DEBUG(dbgs() << " Wrong type on " << *I << "\n");
+ return false;
+ }
+
+ // Check that it looks beneficial
+ if (!isProfitableToInterleave(Exts, Truncs))
+ return false;
+
+ // Create new shuffles around the extends / truncs / other leaves.
+ IRBuilder<> Builder(Start);
+
+ SmallVector<int, 16> LeafMask;
+ SmallVector<int, 16> TruncMask;
+ // LeafMask : 0, 2, 4, 6, 1, 3, 5, 7 | 8, 10, 12, 14, 9, 11, 13, 15
+ // TruncMask: 0, 4, 1, 5, 2, 6, 3, 7 | 8, 12, 9, 13, 10, 14, 11, 15
+ for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
+ for (unsigned i = 0; i < BaseElts / 2; i++)
+ LeafMask.push_back(Base + i * 2);
+ for (unsigned i = 0; i < BaseElts / 2; i++)
+ LeafMask.push_back(Base + i * 2 + 1);
+ }
+ for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
+ for (unsigned i = 0; i < BaseElts / 2; i++) {
+ TruncMask.push_back(Base + i);
+ TruncMask.push_back(Base + i + BaseElts / 2);
+ }
+ }
+
+ for (Instruction *I : Exts) {
+ LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
+ Builder.SetInsertPoint(I);
+ Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
+ bool Sext = isa<SExtInst>(I);
+ Value *Ext = Sext ? Builder.CreateSExt(Shuffle, I->getType())
+ : Builder.CreateZExt(Shuffle, I->getType());
+ I->replaceAllUsesWith(Ext);
+ LLVM_DEBUG(dbgs() << " with " << *Shuffle << "\n");
+ }
+
+ for (Instruction *I : Truncs) {
+ LLVM_DEBUG(dbgs() << "Replacing trunc " << *I << "\n");
+
+ Builder.SetInsertPoint(I->getParent(), ++I->getIterator());
+ Value *Shuf = Builder.CreateShuffleVector(I, TruncMask);
+ I->replaceAllUsesWith(Shuf);
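+ // The RAUW above also rewrote Shuf's own input to point at Shuf itself;
+ // restore the original trunc as its operand.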
+ cast<Instruction>(Shuf)->setOperand(0, I);
+
+ LLVM_DEBUG(dbgs() << " with " << *Shuf << "\n");
+ }
+
+ return true;
+}
+
+bool MVELaneInterleaving::runOnFunction(Function &F) {
+ if (!EnableInterleave)
+ return false;
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<TargetMachine>();
+ auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ bool Changed = false;
+
+ SmallPtrSet<Instruction *, 16> Visited;
+ for (Instruction &I : reverse(instructions(F))) {
+ if (I.getType()->isVectorTy() && isa<TruncInst>(I) && !Visited.count(&I))
+ Changed |= tryInterleave(&I, Visited);
+ }
+
+ return Changed;
+}
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 384d86dbc828..118beab8d295 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -10,6 +10,7 @@
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: MVE gather/scatter lowering
+; CHECK-NEXT: MVE lane interleaving
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Canonicalize natural loops
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index f894f2acd188..883a0781f313 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -76,34 +76,22 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @loads_i16(<8 x i16> *%A, <8 x i16> *%B, <8 x i16> *%C) {
; CHECK-LABEL: loads_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vldrh.s32 q1, [r0]
-; CHECK-NEXT: vldrh.s32 q2, [r0, #8]
-; CHECK-NEXT: vadd.i32 q0, q1, q0
-; CHECK-NEXT: vldrh.u32 q1, [r2]
-; CHECK-NEXT: vneg.s32 q1, q1
-; CHECK-NEXT: vshl.s32 q1, q0, q1
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov.16 q0[0], r3
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: vmov.16 q0[1], r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: vmov.16 q0[2], r3
-; CHECK-NEXT: vmov r3, s7
-; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT: vmov.16 q0[3], r3
-; CHECK-NEXT: vadd.i32 q1, q2, q1
-; CHECK-NEXT: vldrh.u32 q2, [r2, #8]
-; CHECK-NEXT: vneg.s32 q2, q2
-; CHECK-NEXT: vshl.s32 q1, q1, q2
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vmovlb.s16 q1, q0
+; CHECK-NEXT: vmovlb.s16 q3, q2
+; CHECK-NEXT: vmovlt.s16 q0, q0
+; CHECK-NEXT: vmovlt.s16 q2, q2
+; CHECK-NEXT: vadd.i32 q0, q2, q0
+; CHECK-NEXT: vldrw.u32 q2, [r2]
+; CHECK-NEXT: vadd.i32 q1, q3, q1
+; CHECK-NEXT: vmovlt.u16 q3, q2
+; CHECK-NEXT: vneg.s32 q3, q3
+; CHECK-NEXT: vshl.s32 q3, q0, q3
+; CHECK-NEXT: vmovlb.u16 q0, q2
+; CHECK-NEXT: vneg.s32 q0, q0
+; CHECK-NEXT: vshl.s32 q0, q1, q0
+; CHECK-NEXT: vmovnt.i32 q0, q3
; CHECK-NEXT: bx lr
entry:
%a = load <8 x i16>, <8 x i16> *%A, align 4
@@ -121,50 +109,22 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @loads_i8(<16 x i8> *%A, <16 x i8> *%B, <16 x i8> *%C) {
; CHECK-LABEL: loads_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s16 q0, [r1]
-; CHECK-NEXT: vldrb.s16 q1, [r0]
-; CHECK-NEXT: vldrb.s16 q2, [r0, #8]
-; CHECK-NEXT: vadd.i16 q0, q1, q0
-; CHECK-NEXT: vldrb.u16 q1, [r2]
-; CHECK-NEXT: vneg.s16 q1, q1
-; CHECK-NEXT: vshl.s16 q1, q0, q1
-; CHECK-NEXT: vmov.u16 r3, q1[0]
-; CHECK-NEXT: vmov.8 q0[0], r3
-; CHECK-NEXT: vmov.u16 r3, q1[1]
-; CHECK-NEXT: vmov.8 q0[1], r3
-; CHECK-NEXT: vmov.u16 r3, q1[2]
-; CHECK-NEXT: vmov.8 q0[2], r3
-; CHECK-NEXT: vmov.u16 r3, q1[3]
-; CHECK-NEXT: vmov.8 q0[3], r3
-; CHECK-NEXT: vmov.u16 r3, q1[4]
-; CHECK-NEXT: vmov.8 q0[4], r3
-; CHECK-NEXT: vmov.u16 r3, q1[5]
-; CHECK-NEXT: vmov.8 q0[5], r3
-; CHECK-NEXT: vmov.u16 r3, q1[6]
-; CHECK-NEXT: vmov.8 q0[6], r3
-; CHECK-NEXT: vmov.u16 r3, q1[7]
-; CHECK-NEXT: vldrb.s16 q1, [r1, #8]
-; CHECK-NEXT: vmov.8 q0[7], r3
-; CHECK-NEXT: vadd.i16 q1, q2, q1
-; CHECK-NEXT: vldrb.u16 q2, [r2, #8]
-; CHECK-NEXT: vneg.s16 q2, q2
-; CHECK-NEXT: vshl.s16 q1, q1, q2
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q0[9], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.8 q0[13], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.8 q0[15], r0
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vmovlb.s8 q1, q0
+; CHECK-NEXT: vmovlb.s8 q3, q2
+; CHECK-NEXT: vmovlt.s8 q0, q0
+; CHECK-NEXT: vmovlt.s8 q2, q2
+; CHECK-NEXT: vadd.i16 q0, q2, q0
+; CHECK-NEXT: vldrw.u32 q2, [r2]
+; CHECK-NEXT: vadd.i16 q1, q3, q1
+; CHECK-NEXT: vmovlt.u8 q3, q2
+; CHECK-NEXT: vneg.s16 q3, q3
+; CHECK-NEXT: vshl.s16 q3, q0, q3
+; CHECK-NEXT: vmovlb.u8 q0, q2
+; CHECK-NEXT: vneg.s16 q0, q0
+; CHECK-NEXT: vshl.s16 q0, q1, q0
+; CHECK-NEXT: vmovnt.i16 q0, q3
; CHECK-NEXT: bx lr
entry:
%a = load <16 x i8>, <16 x i8> *%A, align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index c14efc566aed..c8f0a7a85873 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -96,52 +96,11 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @ext_add_trunc_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_add_trunc_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.u16 r1, q1[0]
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.u16 r1, q1[1]
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s15
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vadd.i32 q0, q3, q4
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vrev32.16 q3, q0
+; CHECK-NEXT: vrev32.16 q2, q1
+; CHECK-NEXT: vadd.i32 q2, q3, q2
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vmovnt.i32 q0, q2
; CHECK-NEXT: bx lr
entry:
%sa = sext <8 x i16> %a to <8 x i32>
@@ -154,108 +113,11 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @ext_add_trunc_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_trunc_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[1]
-; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[2]
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[3]
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[5]
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[6]
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov.u8 r0, q1[7]
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u8 r0, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u8 r0, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u8 r0, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u8 r0, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vadd.i16 q3, q3, q2
-; CHECK-NEXT: vmov.u16 r0, q3[0]
-; CHECK-NEXT: vmov.8 q2[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[1]
-; CHECK-NEXT: vmov.8 q2[1], r0
-; CHECK-NEXT: vmov.u16 r0, q3[2]
-; CHECK-NEXT: vmov.8 q2[2], r0
-; CHECK-NEXT: vmov.u16 r0, q3[3]
-; CHECK-NEXT: vmov.8 q2[3], r0
-; CHECK-NEXT: vmov.u16 r0, q3[4]
-; CHECK-NEXT: vmov.8 q2[4], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.8 q2[5], r0
-; CHECK-NEXT: vmov.u16 r0, q3[6]
-; CHECK-NEXT: vmov.8 q2[6], r0
-; CHECK-NEXT: vmov.u16 r0, q3[7]
-; CHECK-NEXT: vmov.8 q2[7], r0
-; CHECK-NEXT: vmov.u8 r0, q0[8]
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[9]
-; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[10]
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[11]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u8 r0, q0[12]
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u8 r0, q0[13]
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u8 r0, q0[14]
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u8 r0, q1[8]
-; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[9]
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[10]
-; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[11]
-; CHECK-NEXT: vmov.16 q4[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[12]
-; CHECK-NEXT: vmov.16 q4[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.16 q4[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: vmov.u8 r0, q0[15]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vadd.i16 q0, q3, q4
-; CHECK-NEXT: vmov.u16 r0, q0[0]
-; CHECK-NEXT: vmov.8 q2[8], r0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.8 q2[9], r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.8 q2[10], r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.8 q2[11], r0
-; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.8 q2[12], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.8 q2[13], r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.8 q2[14], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.8 q2[15], r0
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vrev16.8 q3, q0
+; CHECK-NEXT: vrev16.8 q2, q1
+; CHECK-NEXT: vadd.i16 q2, q3, q2
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vmovnt.i16 q0, q2
; CHECK-NEXT: bx lr
entry:
%sa = sext <16 x i8> %a to <16 x i16>
@@ -268,95 +130,19 @@ entry:
define arm_aapcs_vfpcc <16 x i16> @ext_add_trunc_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: ext_add_trunc_v16i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.u16 r0, q2[2]
-; CHECK-NEXT: vmov.u16 r1, q2[0]
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.u16 r1, q2[1]
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q4[2]
-; CHECK-NEXT: vmov.u16 r1, q4[0]
-; CHECK-NEXT: vmov q5[2], q5[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q4[3]
-; CHECK-NEXT: vmov.u16 r1, q4[1]
-; CHECK-NEXT: vmov q5[3], q5[1], r1, r0
-; CHECK-NEXT: vmov.u16 r1, q4[4]
-; CHECK-NEXT: vadd.i32 q5, q5, q0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov r0, s21
-; CHECK-NEXT: vmov.16 q0[1], r0
-; CHECK-NEXT: vmov r0, s22
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov r0, s23
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q4[6]
-; CHECK-NEXT: vmov q5[2], q5[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q2[6]
-; CHECK-NEXT: vmov.u16 r1, q2[4]
-; CHECK-NEXT: vmov q6[2], q6[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.u16 r1, q2[5]
-; CHECK-NEXT: vmov q6[3], q6[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q4[7]
-; CHECK-NEXT: vmov.u16 r1, q4[5]
-; CHECK-NEXT: vmov q5[3], q5[1], r1, r0
-; CHECK-NEXT: vmov.u16 r1, q3[0]
-; CHECK-NEXT: vadd.i32 q2, q5, q6
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s9
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s11
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov.u16 r0, q3[2]
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q3[3]
-; CHECK-NEXT: vmov.u16 r1, q3[1]
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.u16 r1, q1[0]
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.u16 r1, q1[1]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vmov r0, s17
-; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s19
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q3[6]
-; CHECK-NEXT: vmov.u16 r1, q3[4]
-; CHECK-NEXT: vmov q5[2], q5[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q3[7]
-; CHECK-NEXT: vmov.u16 r1, q3[5]
-; CHECK-NEXT: vmov q5[3], q5[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT: vadd.i32 q1, q4, q5
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vrev32.16 q5, q0
+; CHECK-NEXT: vrev32.16 q4, q2
+; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vmovnt.i32 q0, q4
+; CHECK-NEXT: vrev32.16 q4, q1
+; CHECK-NEXT: vrev32.16 q2, q3
+; CHECK-NEXT: vadd.i32 q1, q1, q3
+; CHECK-NEXT: vadd.i32 q2, q4, q2
+; CHECK-NEXT: vmovnt.i32 q1, q2
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
%sa = sext <16 x i16> %a to <16 x i32>
@@ -369,207 +155,19 @@ entry:
define arm_aapcs_vfpcc <32 x i8> @ext_add_trunc_v32i8(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: ext_add_trunc_v32i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.u8 r0, q2[0]
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.u8 r0, q2[1]
-; CHECK-NEXT: vmov.16 q0[1], r0
-; CHECK-NEXT: vmov.u8 r0, q2[2]
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov.u8 r0, q2[3]
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov.u8 r0, q2[4]
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.u8 r0, q2[5]
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov.u8 r0, q2[6]
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.u8 r0, q2[7]
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov.u8 r0, q4[0]
-; CHECK-NEXT: vmov.16 q5[0], r0
-; CHECK-NEXT: vmov.u8 r0, q4[1]
-; CHECK-NEXT: vmov.16 q5[1], r0
-; CHECK-NEXT: vmov.u8 r0, q4[2]
-; CHECK-NEXT: vmov.16 q5[2], r0
-; CHECK-NEXT: vmov.u8 r0, q4[3]
-; CHECK-NEXT: vmov.16 q5[3], r0
-; CHECK-NEXT: vmov.u8 r0, q4[4]
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.u8 r0, q4[5]
-; CHECK-NEXT: vmov.16 q5[5], r0
-; CHECK-NEXT: vmov.u8 r0, q4[6]
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u8 r0, q4[7]
-; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vadd.i16 q5, q5, q0
-; CHECK-NEXT: vmov.u16 r0, q5[0]
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q5[1]
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q5[2]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov.u16 r0, q5[3]
-; CHECK-NEXT: vmov.8 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q5[4]
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q5[5]
-; CHECK-NEXT: vmov.8 q0[5], r0
-; CHECK-NEXT: vmov.u16 r0, q5[6]
-; CHECK-NEXT: vmov.8 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q5[7]
-; CHECK-NEXT: vmov.8 q0[7], r0
-; CHECK-NEXT: vmov.u8 r0, q4[8]
-; CHECK-NEXT: vmov.16 q5[0], r0
-; CHECK-NEXT: vmov.u8 r0, q4[9]
-; CHECK-NEXT: vmov.16 q5[1], r0
-; CHECK-NEXT: vmov.u8 r0, q4[10]
-; CHECK-NEXT: vmov.16 q5[2], r0
-; CHECK-NEXT: vmov.u8 r0, q4[11]
-; CHECK-NEXT: vmov.16 q5[3], r0
-; CHECK-NEXT: vmov.u8 r0, q4[12]
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.u8 r0, q4[13]
-; CHECK-NEXT: vmov.16 q5[5], r0
-; CHECK-NEXT: vmov.u8 r0, q4[14]
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u8 r0, q2[8]
-; CHECK-NEXT: vmov.16 q6[0], r0
-; CHECK-NEXT: vmov.u8 r0, q2[9]
-; CHECK-NEXT: vmov.16 q6[1], r0
-; CHECK-NEXT: vmov.u8 r0, q2[10]
-; CHECK-NEXT: vmov.16 q6[2], r0
-; CHECK-NEXT: vmov.u8 r0, q2[11]
-; CHECK-NEXT: vmov.16 q6[3], r0
-; CHECK-NEXT: vmov.u8 r0, q2[12]
-; CHECK-NEXT: vmov.16 q6[4], r0
-; CHECK-NEXT: vmov.u8 r0, q2[13]
-; CHECK-NEXT: vmov.16 q6[5], r0
-; CHECK-NEXT: vmov.u8 r0, q2[14]
-; CHECK-NEXT: vmov.16 q6[6], r0
-; CHECK-NEXT: vmov.u8 r0, q2[15]
-; CHECK-NEXT: vmov.16 q6[7], r0
-; CHECK-NEXT: vmov.u8 r0, q4[15]
-; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vadd.i16 q2, q5, q6
-; CHECK-NEXT: vmov.u16 r0, q2[0]
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vmov.8 q0[9], r0
-; CHECK-NEXT: vmov.u16 r0, q2[2]
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
-; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.8 q0[13], r0
-; CHECK-NEXT: vmov.u16 r0, q2[6]
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.8 q0[15], r0
-; CHECK-NEXT: vmov.u8 r0, q3[0]
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vmov.u8 r0, q3[1]
-; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov.u8 r0, q3[2]
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov.u8 r0, q3[3]
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov.u8 r0, q3[4]
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov.u8 r0, q3[5]
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov.u8 r0, q3[6]
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov.u8 r0, q3[7]
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[1]
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[2]
-; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[3]
-; CHECK-NEXT: vmov.16 q4[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.16 q4[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[5]
-; CHECK-NEXT: vmov.16 q4[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[6]
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov.u8 r0, q1[7]
-; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: vadd.i16 q4, q4, q2
-; CHECK-NEXT: vmov.u16 r0, q4[0]
-; CHECK-NEXT: vmov.8 q2[0], r0
-; CHECK-NEXT: vmov.u16 r0, q4[1]
-; CHECK-NEXT: vmov.8 q2[1], r0
-; CHECK-NEXT: vmov.u16 r0, q4[2]
-; CHECK-NEXT: vmov.8 q2[2], r0
-; CHECK-NEXT: vmov.u16 r0, q4[3]
-; CHECK-NEXT: vmov.8 q2[3], r0
-; CHECK-NEXT: vmov.u16 r0, q4[4]
-; CHECK-NEXT: vmov.8 q2[4], r0
-; CHECK-NEXT: vmov.u16 r0, q4[5]
-; CHECK-NEXT: vmov.8 q2[5], r0
-; CHECK-NEXT: vmov.u16 r0, q4[6]
-; CHECK-NEXT: vmov.8 q2[6], r0
-; CHECK-NEXT: vmov.u16 r0, q4[7]
-; CHECK-NEXT: vmov.8 q2[7], r0
-; CHECK-NEXT: vmov.u8 r0, q1[8]
-; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[9]
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[10]
-; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[11]
-; CHECK-NEXT: vmov.16 q4[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[12]
-; CHECK-NEXT: vmov.16 q4[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.16 q4[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov.u8 r0, q3[8]
-; CHECK-NEXT: vmov.16 q5[0], r0
-; CHECK-NEXT: vmov.u8 r0, q3[9]
-; CHECK-NEXT: vmov.16 q5[1], r0
-; CHECK-NEXT: vmov.u8 r0, q3[10]
-; CHECK-NEXT: vmov.16 q5[2], r0
-; CHECK-NEXT: vmov.u8 r0, q3[11]
-; CHECK-NEXT: vmov.16 q5[3], r0
-; CHECK-NEXT: vmov.u8 r0, q3[12]
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.u8 r0, q3[13]
-; CHECK-NEXT: vmov.16 q5[5], r0
-; CHECK-NEXT: vmov.u8 r0, q3[14]
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u8 r0, q3[15]
-; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: vadd.i16 q1, q4, q5
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.8 q2[8], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q2[9], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q2[10], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q2[11], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q2[12], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.8 q2[13], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.8 q2[14], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.8 q2[15], r0
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vrev16.8 q5, q0
+; CHECK-NEXT: vrev16.8 q4, q2
+; CHECK-NEXT: vadd.i16 q0, q0, q2
+; CHECK-NEXT: vadd.i16 q4, q5, q4
+; CHECK-NEXT: vmovnt.i16 q0, q4
+; CHECK-NEXT: vrev16.8 q4, q1
+; CHECK-NEXT: vrev16.8 q2, q3
+; CHECK-NEXT: vadd.i16 q1, q1, q3
+; CHECK-NEXT: vadd.i16 q2, q4, q2
+; CHECK-NEXT: vmovnt.i16 q1, q2
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
%sa = sext <32 x i8> %a to <32 x i16>
@@ -1075,70 +673,31 @@ define arm_aapcs_vfpcc <8 x i16> @ext_ops_trunc_i16(<8 x i16> %a, <8 x i16> %b)
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmovlb.u16 q2, q2
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vneg.s32 q5, q2
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmovlb.s16 q3, q3
-; CHECK-NEXT: vmov.u16 r1, q1[0]
+; CHECK-NEXT: vmovlt.u16 q2, q1
+; CHECK-NEXT: vmovlt.s16 q3, q0
; CHECK-NEXT: vadd.i32 q4, q3, q2
-; CHECK-NEXT: vcmp.i32 eq, q3, q2
+; CHECK-NEXT: vneg.s32 q5, q2
; CHECK-NEXT: vshl.s32 q4, q4, q5
; CHECK-NEXT: vneg.s32 q5, q3
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.u16 r1, q1[1]
; CHECK-NEXT: vsub.i32 q4, q4, q2
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vcmp.i32 eq, q3, q2
; CHECK-NEXT: vmul.i32 q4, q4, q2
-; CHECK-NEXT: vmovlb.u16 q1, q3
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vshl.u32 q4, q4, q5
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vshl.u32 q4, q4, q2
-; CHECK-NEXT: vmovlb.s16 q0, q3
-; CHECK-NEXT: vpsel q2, q4, q2
; CHECK-NEXT: vadd.i32 q3, q0, q1
+; CHECK-NEXT: vpsel q2, q4, q2
; CHECK-NEXT: vneg.s32 q4, q1
-; CHECK-NEXT: vcmp.i32 eq, q0, q1
; CHECK-NEXT: vshl.s32 q3, q3, q4
; CHECK-NEXT: vneg.s32 q4, q0
; CHECK-NEXT: vsub.i32 q3, q3, q1
+; CHECK-NEXT: vcmp.i32 eq, q0, q1
; CHECK-NEXT: vmul.i32 q3, q3, q1
; CHECK-NEXT: vshl.u32 q3, q3, q4
; CHECK-NEXT: vshl.u32 q3, q3, q1
-; CHECK-NEXT: vpsel q1, q3, q1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vmov.16 q0[1], r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s9
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s11
-; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: vpsel q0, q3, q1
+; CHECK-NEXT: vmovnt.i32 q0, q2
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
@@ -1161,126 +720,31 @@ define arm_aapcs_vfpcc <16 x i8> @ext_ops_trunc_i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vmov.u8 r0, q1[8]
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[9]
-; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[10]
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[11]
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[12]
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov.u8 r0, q0[8]
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[9]
-; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[10]
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[11]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u8 r0, q0[12]
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u8 r0, q0[13]
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u8 r0, q0[14]
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u8 r0, q0[15]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.u8 q2, q2
-; CHECK-NEXT: vmovlb.s8 q3, q3
-; CHECK-NEXT: vneg.s16 q5, q2
+; CHECK-NEXT: vmovlt.u8 q2, q1
+; CHECK-NEXT: vmovlt.s8 q3, q0
; CHECK-NEXT: vadd.i16 q4, q3, q2
-; CHECK-NEXT: vmov.u8 r0, q1[0]
+; CHECK-NEXT: vneg.s16 q5, q2
; CHECK-NEXT: vshl.s16 q4, q4, q5
; CHECK-NEXT: vneg.s16 q5, q3
-; CHECK-NEXT: vcmp.i16 eq, q3, q2
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vsub.i16 q4, q4, q2
-; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[2]
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[3]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[6]
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u8 r0, q1[7]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmovlb.u8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[1]
+; CHECK-NEXT: vcmp.i16 eq, q3, q2
; CHECK-NEXT: vmul.i16 q4, q4, q2
-; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u8 r0, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u8 r0, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u8 r0, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u8 r0, q0[7]
+; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vshl.u16 q4, q4, q5
-; CHECK-NEXT: vmov.16 q3[7], r0
+; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vshl.u16 q4, q4, q2
-; CHECK-NEXT: vmovlb.s8 q0, q3
-; CHECK-NEXT: vpsel q2, q4, q2
; CHECK-NEXT: vadd.i16 q3, q0, q1
+; CHECK-NEXT: vpsel q2, q4, q2
; CHECK-NEXT: vneg.s16 q4, q1
-; CHECK-NEXT: vcmp.i16 eq, q0, q1
; CHECK-NEXT: vshl.s16 q3, q3, q4
; CHECK-NEXT: vneg.s16 q4, q0
; CHECK-NEXT: vsub.i16 q3, q3, q1
+; CHECK-NEXT: vcmp.i16 eq, q0, q1
; CHECK-NEXT: vmul.i16 q3, q3, q1
; CHECK-NEXT: vshl.u16 q3, q3, q4
; CHECK-NEXT: vshl.u16 q3, q3, q1
-; CHECK-NEXT: vpsel q1, q3, q1
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.8 q0[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.8 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.8 q0[7], r0
-; CHECK-NEXT: vmov.u16 r0, q2[0]
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vmov.8 q0[9], r0
-; CHECK-NEXT: vmov.u16 r0, q2[2]
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
-; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.8 q0[13], r0
-; CHECK-NEXT: vmov.u16 r0, q2[6]
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.8 q0[15], r0
+; CHECK-NEXT: vpsel q0, q3, q1
+; CHECK-NEXT: vmovnt.i16 q0, q2
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry: