[llvm] [AArch64][GlobalISel] Perfect Shuffles (PR #106446)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 13:17:49 PDT 2024
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/106446
This is a port of the existing perfect shuffle generation code from SDAG, following the same structure. I wrote it a while ago and it has been sitting around. It brings the codegen for shuffles inline and avoids the need for generating a tbl and constant pool load.
>From 19c2a5f2695696fd22e3e4d92ea9ad0b8ba13d90 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 22 May 2024 22:14:49 +0100
Subject: [PATCH] [AArch64][GlobalISel] Perfect Shuffles
This is a port of the existing perfect shuffle generation code from SDAG,
following the same structure. I wrote it a while ago and it has been sitting
on my machine. It brings the codegen for certain shuffles inline and avoids the
need for generating a tbl and constant pool load.
---
.../CodeGen/GlobalISel/MachineIRBuilder.h | 18 ++
llvm/lib/Target/AArch64/AArch64Combine.td | 10 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 19 --
.../Target/AArch64/AArch64PerfectShuffle.h | 19 ++
.../GISel/AArch64PostLegalizerLowering.cpp | 183 ++++++++++++++++++
.../postlegalizer-lowering-shuffle-splat.mir | 12 +-
.../GlobalISel/postlegalizer-lowering-uzp.mir | 15 +-
.../GlobalISel/postlegalizer-lowering-zip.mir | 9 +-
llvm/test/CodeGen/AArch64/arm64-dup.ll | 103 +++-------
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 24 +--
llvm/test/CodeGen/AArch64/arm64-rev.ll | 5 +-
.../CodeGen/AArch64/extract-vector-elt.ll | 11 +-
.../AArch64/neon-bitwise-instructions.ll | 13 +-
llvm/test/CodeGen/AArch64/shufflevector.ll | 57 ++----
14 files changed, 319 insertions(+), 179 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 92e05ee858a755..2a4121e0be1d46 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1302,6 +1302,24 @@ class MachineIRBuilder {
const SrcOp &Elt,
const SrcOp &Idx);
+ /// Build and insert \p Res = G_INSERT_VECTOR_ELT \p Val, \p Elt, \p Idx
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p Res must be a generic virtual register with scalar type.
+ /// \pre \p Val must be a generic virtual register with vector type.
+ /// \pre \p Elt must be a generic virtual register with scalar type.
+ ///
+ /// \return The newly created instruction.
+ MachineInstrBuilder buildInsertVectorElementConstant(const DstOp &Res,
+ const SrcOp &Val,
+ const SrcOp &Elt,
+ const int Idx) {
+ auto TLI = getMF().getSubtarget().getTargetLowering();
+ unsigned VecIdxWidth = TLI->getVectorIdxTy(getDataLayout()).getSizeInBits();
+ return buildInsertVectorElement(
+ Res, Val, Elt, buildConstant(LLT::scalar(VecIdxWidth), Idx));
+ }
+
/// Build and insert \p Res = G_EXTRACT_VECTOR_ELT \p Val, \p Idx
///
/// \pre setBasicBlock or setMI must have been called.
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 10cad6d1924407..3541ca423c0d1d 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -129,6 +129,13 @@ def shuf_to_ins: GICombineRule <
(apply [{ applyINS(*${root}, MRI, B, ${matchinfo}); }])
>;
+def perfect_shuffle: GICombineRule <
+ (defs root:$root),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchPerfectShuffle(*${root}, MRI); }]),
+ (apply [{ applyPerfectShuffle(*${root}, MRI, B); }])
+>;
+
def vashr_vlshr_imm_matchdata : GIDefMatchData<"int64_t">;
def vashr_vlshr_imm : GICombineRule<
(defs root:$root, vashr_vlshr_imm_matchdata:$matchinfo),
@@ -147,7 +154,8 @@ def form_duplane : GICombineRule <
>;
def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
- form_duplane, shuf_to_ins]>;
+ form_duplane, shuf_to_ins,
+ perfect_shuffle]>;
// Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's
def vector_unmerge_lowering : GICombineRule <
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e31a27e9428e8a..2100d5d87e90bc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12090,25 +12090,6 @@ static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
- enum {
- OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
- OP_VREV,
- OP_VDUP0,
- OP_VDUP1,
- OP_VDUP2,
- OP_VDUP3,
- OP_VEXT1,
- OP_VEXT2,
- OP_VEXT3,
- OP_VUZPL, // VUZP, left result
- OP_VUZPR, // VUZP, right result
- OP_VZIPL, // VZIP, left result
- OP_VZIPR, // VZIP, right result
- OP_VTRNL, // VTRN, left result
- OP_VTRNR, // VTRN, right result
- OP_MOVLANE // Move lane. RHSID is the lane to move into
- };
-
if (OpNum == OP_COPY) {
if (LHSID == (1 * 9 + 2) * 9 + 3)
return LHS;
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index 7b044cf7c238fd..3691b4787b1b66 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6588,6 +6588,25 @@ static const unsigned PerfectShuffleTable[6561 + 1] = {
835584U, // <u,u,u,u>: Cost 0 copy LHS
0};
+enum {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR, // VTRN, right result
+ OP_MOVLANE // Move lane. RHSID is the lane to move into
+};
+
inline unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
assert(M.size() == 4 && "Expected a 4 entry perfect shuffle");
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 77b8cbe5793c39..7e63c4e8a7fe9f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -504,6 +504,189 @@ void applyINS(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}
+/// Match a 4-element G_SHUFFLE_VECTOR.
+bool matchPerfectShuffle(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ return MRI.getType(MI.getOperand(0).getReg()).getNumElements() == 4;
+}
+
+static Register GeneratePerfectShuffle(unsigned ID, Register V1, Register V2,
+ unsigned PFEntry, Register LHS,
+ Register RHS, MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
+
+ if (OpNum == OP_COPY) {
+ if (LHSID == (1 * 9 + 2) * 9 + 3)
+ return LHS;
+ assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
+ return RHS;
+ }
+
+ if (OpNum == OP_MOVLANE) {
+ // Decompose a PerfectShuffle ID to get the Mask for lane Elt
+ auto getPFIDLane = [](unsigned ID, int Elt) -> int {
+ assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
+ Elt = 3 - Elt;
+ while (Elt > 0) {
+ ID /= 9;
+ Elt--;
+ }
+ return (ID % 9 == 8) ? -1 : ID % 9;
+ };
+
+ // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
+ // get the lane to move from the PFID, which is always from the
+ // original vectors (V1 or V2).
+ Register OpLHS = GeneratePerfectShuffle(
+ LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, MIB, MRI);
+ LLT VT = MRI.getType(OpLHS);
+ assert(RHSID < 8 && "Expected a lane index for RHSID!");
+ unsigned ExtLane = 0;
+ Register Input;
+
+ // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
+ // convert into a higher type.
+ if (RHSID & 0x4) {
+ int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
+ if (MaskElt == -1)
+ MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
+ assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+ ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
+ Input = MaskElt < 2 ? V1 : V2;
+ if (VT.getScalarSizeInBits() == 16 && VT != LLT::fixed_vector(2, 32)) {
+ Input = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(2, 32)}, {Input})
+ .getReg(0);
+ OpLHS = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(2, 32)}, {OpLHS})
+ .getReg(0);
+ }
+ if (VT.getScalarSizeInBits() == 32 && VT != LLT::fixed_vector(2, 64)) {
+ Input = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(2, 64)}, {Input})
+ .getReg(0);
+ OpLHS = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(2, 64)}, {OpLHS})
+ .getReg(0);
+ }
+ } else {
+ int MaskElt = getPFIDLane(ID, RHSID);
+ assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+ ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
+ Input = MaskElt < 4 ? V1 : V2;
+ if (VT.getScalarSizeInBits() == 16 && VT != LLT::fixed_vector(4, 16)) {
+ Input = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(4, 16)}, {Input})
+ .getReg(0);
+ OpLHS = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(4, 16)}, {OpLHS})
+ .getReg(0);
+ }
+ }
+ auto Ext = MIB.buildExtractVectorElementConstant(
+ MRI.getType(Input).getElementType(), Input, ExtLane);
+ auto Ins = MIB.buildInsertVectorElementConstant(MRI.getType(Input), OpLHS,
+ Ext, RHSID & 0x3);
+ if (MRI.getType(Ins.getReg(0)) != VT)
+ return MIB.buildInstr(TargetOpcode::G_BITCAST, {VT}, {Ins}).getReg(0);
+ return Ins.getReg(0);
+ }
+
+ Register OpLHS, OpRHS;
+ OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
+ RHS, MIB, MRI);
+ OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
+ RHS, MIB, MRI);
+ LLT VT = MRI.getType(OpLHS);
+
+ switch (OpNum) {
+ default:
+ llvm_unreachable("Unknown shuffle opcode!");
+ case OP_VREV: {
+ // VREV divides the vector in half and swaps within the half.
+ unsigned Opcode = VT.getScalarSizeInBits() == 32 ? AArch64::G_REV64
+ : VT.getScalarSizeInBits() == 16 ? AArch64::G_REV32
+ : AArch64::G_REV16;
+ return MIB.buildInstr(Opcode, {VT}, {OpLHS}).getReg(0);
+ }
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3: {
+ unsigned Opcode;
+ if (VT.getScalarSizeInBits() == 8)
+ Opcode = AArch64::G_DUPLANE8;
+ else if (VT.getScalarSizeInBits() == 16)
+ Opcode = AArch64::G_DUPLANE16;
+ else if (VT.getScalarSizeInBits() == 32)
+ Opcode = AArch64::G_DUPLANE32;
+ else if (VT.getScalarSizeInBits() == 64)
+ Opcode = AArch64::G_DUPLANE64;
+ else
+ llvm_unreachable("Invalid vector element type?");
+
+ if (VT.getSizeInBits() == 64)
+ OpLHS = MIB.buildConcatVectors(
+ VT.changeElementCount(VT.getElementCount() * 2),
+ {OpLHS, MIB.buildUndef(VT).getReg(0)})
+ .getReg(0);
+ Register Lane =
+ MIB.buildConstant(LLT::scalar(64), OpNum - OP_VDUP0).getReg(0);
+ return MIB.buildInstr(Opcode, {VT}, {OpLHS, Lane}).getReg(0);
+ }
+ case OP_VEXT1:
+ case OP_VEXT2:
+ case OP_VEXT3: {
+ unsigned Imm = (OpNum - OP_VEXT1 + 1) * VT.getScalarSizeInBits() / 8;
+ return MIB
+ .buildInstr(AArch64::G_EXT, {VT},
+ {OpLHS, OpRHS, MIB.buildConstant(LLT::scalar(64), Imm)})
+ .getReg(0);
+ }
+ case OP_VUZPL:
+ return MIB.buildInstr(AArch64::G_UZP1, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VUZPR:
+ return MIB.buildInstr(AArch64::G_UZP2, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VZIPL:
+ return MIB.buildInstr(AArch64::G_ZIP1, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VZIPR:
+ return MIB.buildInstr(AArch64::G_ZIP2, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VTRNL:
+ return MIB.buildInstr(AArch64::G_TRN1, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VTRNR:
+ return MIB.buildInstr(AArch64::G_TRN2, {VT}, {OpLHS, OpRHS}).getReg(0);
+ }
+}
+
+void applyPerfectShuffle(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &Builder) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ assert(ShuffleMask.size() == 4 && "Expected 4 element mask");
+
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (ShuffleMask[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = ShuffleMask[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+ PFIndexes[2] * 9 + PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ Register Res = GeneratePerfectShuffle(PFTableIndex, LHS, RHS, PFEntry, LHS,
+ RHS, Builder, MRI);
+ Builder.buildCopy(Dst, Res);
+ MI.eraseFromParent();
+}
+
/// isVShiftRImm - Check if this is a valid vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
/// 1 <= Value <= ElementBits for a right shift.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
index 9d12c3c32c7f8b..c426bb957d704d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
@@ -280,8 +280,16 @@ body: |
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s64)
- ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[IVEC]](<4 x s32>), [[DEF]], shufflemask(undef, 0, 0, 3)
- ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[IVEC]](<4 x s32>), [[C1]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[EVEC]](s32), [[C2]](s64)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[IVEC]](<4 x s32>), [[C3]](s64)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], [[EVEC1]](s32), [[C4]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC2]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(s32) = COPY $s0
%2:_(<4 x s32>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir
index d1d5c6c29ba0df..d466df2a55c537 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir
@@ -67,8 +67,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
- ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(0, 1, 4, 6)
- ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK-NEXT: [[ZIP1_:%[0-9]+]]:_(<4 x s32>) = G_ZIP1 [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[UZP1_:%[0-9]+]]:_(<4 x s32>) = G_UZP1 [[ZIP1_]], [[COPY1]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[UZP1_]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%1:_(<4 x s32>) = COPY $q1
@@ -92,8 +94,13 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
- ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(1, 4, 5, 7)
- ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK-NEXT: [[UZP2_:%[0-9]+]]:_(<4 x s32>) = G_UZP2 [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s32>), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[UZP2_]], [[EVEC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%1:_(<4 x s32>) = COPY $q1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir
index bcf088287f46ae..afd5eaa8867bc4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir
@@ -220,8 +220,13 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
- ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(3, 4, 1, 5)
- ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK-NEXT: [[ZIP1_:%[0-9]+]]:_(<4 x s32>) = G_ZIP1 [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[ZIP1_]], [[EVEC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%1:_(<4 x s32>) = COPY $q1
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 2bf5419e54830b..50520ce6987fab 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -408,85 +408,45 @@ entry:
; Also test the DUP path in the PerfectShuffle generator.
define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: trn1.4h v0, v0, v0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov.s v0[1], v1[0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: adrp x8, .LCPI33_0
-; CHECK-GI-NEXT: mov.d v0[1], v1[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI33_0]
-; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: trn1.4h v0, v0, v0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov.s v0[1], v1[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
ret <4 x i16> %r
}
define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: trn1.4h v0, v0, v0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov.s v0[1], v1[0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: adrp x8, .LCPI34_0
-; CHECK-GI-NEXT: mov.d v0[1], v1[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI34_0]
-; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: trn1.4h v0, v0, v0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov.s v0[1], v1[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
ret <4 x half> %r
}
define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: trn1.4s v0, v0, v0
-; CHECK-SD-NEXT: mov.d v0[1], v1[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI35_0
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0]
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: trn1.4s v0, v0, v0
+; CHECK-NEXT: mov.d v0[1], v1[0]
+; CHECK-NEXT: ret
%r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
ret <4 x i32> %r
}
define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: trn1.4s v0, v0, v0
-; CHECK-SD-NEXT: mov.d v0[1], v1[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI36_0
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: trn1.4s v0, v0, v0
+; CHECK-NEXT: mov.d v0[1], v1[0]
+; CHECK-NEXT: ret
%r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
ret <4 x float> %r
}
@@ -503,15 +463,12 @@ define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: disguised_dup:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI37_1
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_1]
-; CHECK-GI-NEXT: adrp x8, .LCPI37_0
-; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_0]
-; CHECK-GI-NEXT: tbl.16b v2, { v0, v1 }, v2
-; CHECK-GI-NEXT: str q0, [x0]
-; CHECK-GI-NEXT: str q2, [x1]
+; CHECK-GI-NEXT: ext.16b v1, v0, v0, #4
+; CHECK-GI-NEXT: mov.s v1[2], v0[0]
+; CHECK-GI-NEXT: zip2.4s v0, v1, v1
+; CHECK-GI-NEXT: str q1, [x0]
+; CHECK-GI-NEXT: ext.16b v0, v1, v0, #12
+; CHECK-GI-NEXT: str q0, [x1]
; CHECK-GI-NEXT: ret
%shuf = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 0>
%dup = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 43d5ab5ab54e10..36660160a2a661 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -2138,19 +2138,10 @@ entry:
}
define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 {
-; CHECK-SD-LABEL: test_concat_v4i32_v4i32_v4i32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v4i32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: adrp x8, .LCPI134_0
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI134_0]
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
entry:
%vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i32> %vecinit6
@@ -2165,13 +2156,10 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: adrp x8, .LCPI135_0
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <2 x i32> %x, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
index f548a0e01feee6..94b1a330044b21 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -426,10 +426,9 @@ define void @float_vrev64(ptr nocapture %source, ptr nocapture %dest) nounwind n
; CHECK-GI-LABEL: float_vrev64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d0, #0000000000000000
-; CHECK-GI-NEXT: adrp x8, .LCPI28_0
; CHECK-GI-NEXT: ldr q1, [x0]
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI28_0]
-; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT: dup.4s v0, v0[0]
+; CHECK-GI-NEXT: mov.s v0[1], v1[3]
; CHECK-GI-NEXT: str q0, [x1, #176]
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 0481d997d24faf..acf1d6d2741f42 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -919,15 +919,12 @@ define i32 @extract_v4i32_shuffle(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sub sp, sp, #16
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GI-NEXT: adrp x8, .LCPI35_0
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: mov x9, sp
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0]
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: mov x9, sp
; CHECK-GI-NEXT: and x8, x8, #0x3
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: mov v1.s[3], v0.s[3]
+; CHECK-GI-NEXT: str q1, [sp]
; CHECK-GI-NEXT: ldr w0, [x9, x8, lsl #2]
; CHECK-GI-NEXT: add sp, sp, #16
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 50c0c8b11e7517..b1b1d7f2d933ef 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1446,10 +1446,8 @@ define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: adrp x8, .LCPI95_0
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI95_0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
@@ -1465,11 +1463,8 @@ define <4 x i32> @vselect_equivalent_shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-GI-LABEL: vselect_equivalent_shuffle_v4i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI96_0
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI96_0]
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
; CHECK-GI-NEXT: ret
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
ret <4 x i32> %c
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index b1131f287fe9a9..eeca54c0410336 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -2,8 +2,8 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for shufflevector_v2i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v2i1_zeroes
+; CHECK-GI: warning: Instruction selection used fallback path for shufflevector_v2i1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v2i1_zeroes
; ===== Legal Vector Types =====
@@ -282,12 +282,10 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NEXT: adrp x8, .LCPI17_0
; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT: zip1 v1.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT: trn2 v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 1, i32 2>
@@ -343,12 +341,9 @@ define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
;
; CHECK-GI-LABEL: shufflevector_v8i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI20_0
-; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT: uzp2 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI20_0]
-; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v4.16b
+; CHECK-GI-NEXT: uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mov v1.s[3], v3.s[3]
; CHECK-GI-NEXT: ret
%c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
ret <8 x i32> %c
@@ -542,22 +537,11 @@ define <7 x i8> @shufflevector_v7i8(<7 x i8> %a, <7 x i8> %b) {
}
define <3 x i16> @shufflevector_v3i16(<3 x i16> %a, <3 x i16> %b) {
-; CHECK-SD-LABEL: shufflevector_v3i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: zip1 v1.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: zip2 v0.4h, v1.4h, v0.4h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: shufflevector_v3i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: adrp x8, .LCPI32_0
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI32_0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: shufflevector_v3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 v1.4h, v0.4h, v1.4h
+; CHECK-NEXT: zip2 v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: ret
%c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 1, i32 2, i32 4>
ret <3 x i16> %c
}
@@ -585,20 +569,11 @@ define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
}
define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) {
-; CHECK-SD-LABEL: shufflevector_v3i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: zip1 v1.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT: zip2 v0.4s, v1.4s, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: shufflevector_v3i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI34_0
-; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0]
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: shufflevector_v3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ret
%c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 1, i32 2, i32 4>
ret <3 x i32> %c
}
More information about the llvm-commits
mailing list