[llvm] ec93b28 - [AArch64] Lower 3 and 4 sources buildvectors to TBL
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 24 03:02:38 PDT 2022
Author: David Green
Date: 2022-03-24T10:02:33Z
New Revision: ec93b28909749619dbe58b092a13da9d1ff1eb1e
URL: https://github.com/llvm/llvm-project/commit/ec93b28909749619dbe58b092a13da9d1ff1eb1e
DIFF: https://github.com/llvm/llvm-project/commit/ec93b28909749619dbe58b092a13da9d1ff1eb1e.diff
LOG: [AArch64] Lower 3 and 4 sources buildvectors to TBL
The default expansion for buildvectors is to extract each element and
insert them into a new vector. That involves a lot of copying to/from
the GPR registers. TBL3 and TBL4 can be relatively slow instructions
with the mask needing to be loaded from a constant pool, but they should
always be better than all the moves to/from GPRs.
Differential Revision: https://reviews.llvm.org/D121137
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
llvm/test/CodeGen/AArch64/tbl-loops.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f2c51e1c68af5..c9cd9c69f8e81 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9079,10 +9079,72 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
+ // If we have 3 or 4 sources, try to generate a TBL, which will at least be
+ // better than moving to/from gpr registers for larger vectors.
+ if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
+ // Construct a mask for the tbl. We may need to adjust the index for types
+ // larger than i8.
+ SmallVector<unsigned, 16> Mask;
+ unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
+ for (unsigned I = 0; I < NumElts; ++I) {
+ SDValue V = Op.getOperand(I);
+ if (V.isUndef()) {
+ for (unsigned OF = 0; OF < OutputFactor; OF++)
+ Mask.push_back(-1);
+ continue;
+ }
+ // The Mask is always i8 and indexes bytes of the concatenated 16-byte
+ // sources. Each output element takes OutputFactor consecutive bytes,
+ // starting at the byte offset of the extracted lane in source S.
+ unsigned Lane = V.getConstantOperandVal(1);
+ for (unsigned S = 0; S < Sources.size(); S++) {
+ if (V.getOperand(0) == Sources[S].Vec) {
+ unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
+ unsigned InputBase = 16 * S + Lane * InputSize / 8;
+ for (unsigned OF = 0; OF < OutputFactor; OF++)
+ Mask.push_back(InputBase + OF);
+ break;
+ }
+ }
+ }
+
+ // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
+ // v16i8, and the TBLMask
+ SmallVector<SDValue, 16> TBLOperands;
+ TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
+ ? Intrinsic::aarch64_neon_tbl3
+ : Intrinsic::aarch64_neon_tbl4,
+ dl, MVT::i32));
+ for (unsigned i = 0; i < Sources.size(); i++) {
+ SDValue Src = Sources[i].Vec;
+ EVT SrcVT = Src.getValueType();
+ Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
+ assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
+ "Expected a legally typed vector");
+ if (SrcVT.is64BitVector())
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
+ DAG.getUNDEF(MVT::v8i8));
+ TBLOperands.push_back(Src);
+ }
+
+ SmallVector<SDValue, 16> TBLMask;
+ for (unsigned i = 0; i < Mask.size(); i++)
+ TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
+ assert((Mask.size() == 8 || Mask.size() == 16) &&
+ "Expected a v8i8 or v16i8 Mask");
+ TBLOperands.push_back(
+ DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
+
+ SDValue Shuffle =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
+ Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
+ return DAG.getBitcast(VT, Shuffle);
+ }
+
if (Sources.size() > 2) {
- LLVM_DEBUG(
- dbgs() << "Reshuffle failed: currently only do something sane when at "
- "most two source vectors are involved\n");
+ LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
+ << "sensible when at most two source vectors are "
+ << "involved\n");
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 244c65312e0ec..c0e3f23b4b9df 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -3321,75 +3321,63 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) {
define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) {
; CHECK-LABEL: test_signed_v8f64_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov d4, v0.d[1]
+; CHECK-NEXT: mov d4, v3.d[1]
; CHECK-NEXT: mov w8, #127
-; CHECK-NEXT: fcvtzs w11, d0
-; CHECK-NEXT: mov w9, #-128
-; CHECK-NEXT: mov d0, v2.d[1]
-; CHECK-NEXT: fcvtzs w13, d1
-; CHECK-NEXT: fcvtzs w15, d3
-; CHECK-NEXT: fcvtzs w10, d4
-; CHECK-NEXT: mov d4, v1.d[1]
-; CHECK-NEXT: mov d1, v3.d[1]
-; CHECK-NEXT: fcvtzs w14, d0
+; CHECK-NEXT: fcvtzs w10, d3
+; CHECK-NEXT: mov w11, #-128
+; CHECK-NEXT: mov d3, v1.d[1]
+; CHECK-NEXT: fcvtzs w13, d2
+; CHECK-NEXT: fcvtzs w15, d1
+; CHECK-NEXT: mov d1, v0.d[1]
+; CHECK-NEXT: fcvtzs w9, d4
+; CHECK-NEXT: mov d4, v2.d[1]
+; CHECK-NEXT: fcvtzs w14, d3
+; CHECK-NEXT: cmp w9, #127
+; CHECK-NEXT: csel w9, w9, w8, lt
+; CHECK-NEXT: fcvtzs w12, d4
+; CHECK-NEXT: cmn w9, #128
+; CHECK-NEXT: csel w9, w9, w11, gt
; CHECK-NEXT: cmp w10, #127
; CHECK-NEXT: csel w10, w10, w8, lt
-; CHECK-NEXT: fcvtzs w12, d4
; CHECK-NEXT: cmn w10, #128
-; CHECK-NEXT: csel w10, w10, w9, gt
-; CHECK-NEXT: cmp w11, #127
-; CHECK-NEXT: csel w11, w11, w8, lt
-; CHECK-NEXT: cmn w11, #128
-; CHECK-NEXT: csel w11, w11, w9, gt
+; CHECK-NEXT: csel w10, w10, w11, gt
; CHECK-NEXT: cmp w12, #127
; CHECK-NEXT: csel w12, w12, w8, lt
; CHECK-NEXT: cmn w12, #128
-; CHECK-NEXT: csel w12, w12, w9, gt
-; CHECK-NEXT: cmp w13, #127
-; CHECK-NEXT: fmov s0, w11
-; CHECK-NEXT: csel w11, w13, w8, lt
-; CHECK-NEXT: cmn w11, #128
-; CHECK-NEXT: fcvtzs w13, d2
-; CHECK-NEXT: csel w11, w11, w9, gt
-; CHECK-NEXT: cmp w14, #127
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: csel w10, w14, w8, lt
-; CHECK-NEXT: cmn w10, #128
-; CHECK-NEXT: fmov s2, w11
-; CHECK-NEXT: csel w10, w10, w9, gt
+; CHECK-NEXT: csel w12, w12, w11, gt
; CHECK-NEXT: cmp w13, #127
-; CHECK-NEXT: mov w11, v0.s[1]
; CHECK-NEXT: csel w13, w13, w8, lt
-; CHECK-NEXT: mov v2.s[1], w12
+; CHECK-NEXT: fmov s5, w10
; CHECK-NEXT: cmn w13, #128
-; CHECK-NEXT: fcvtzs w12, d1
-; CHECK-NEXT: csel w13, w13, w9, gt
-; CHECK-NEXT: mov v0.b[1], w11
-; CHECK-NEXT: fmov w14, s2
-; CHECK-NEXT: cmp w12, #127
-; CHECK-NEXT: fmov s1, w13
-; CHECK-NEXT: csel w12, w12, w8, lt
-; CHECK-NEXT: cmn w12, #128
-; CHECK-NEXT: mov w11, v2.s[1]
-; CHECK-NEXT: mov v0.b[2], w14
-; CHECK-NEXT: csel w12, w12, w9, gt
+; CHECK-NEXT: csel w13, w13, w11, gt
+; CHECK-NEXT: cmp w14, #127
+; CHECK-NEXT: csel w14, w14, w8, lt
+; CHECK-NEXT: cmn w14, #128
+; CHECK-NEXT: csel w10, w14, w11, gt
; CHECK-NEXT: cmp w15, #127
-; CHECK-NEXT: mov v1.s[1], w10
+; CHECK-NEXT: fcvtzs w14, d1
+; CHECK-NEXT: csel w15, w15, w8, lt
+; CHECK-NEXT: cmn w15, #128
+; CHECK-NEXT: mov v5.s[1], w9
+; CHECK-NEXT: csel w9, w15, w11, gt
+; CHECK-NEXT: cmp w14, #127
+; CHECK-NEXT: fcvtzs w15, d0
+; CHECK-NEXT: fmov s4, w13
+; CHECK-NEXT: csel w13, w14, w8, lt
+; CHECK-NEXT: cmn w13, #128
+; CHECK-NEXT: csel w13, w13, w11, gt
+; CHECK-NEXT: cmp w15, #127
+; CHECK-NEXT: mov v4.s[1], w12
; CHECK-NEXT: csel w8, w15, w8, lt
+; CHECK-NEXT: fmov s3, w9
; CHECK-NEXT: cmn w8, #128
-; CHECK-NEXT: csel w8, w8, w9, gt
-; CHECK-NEXT: mov v0.b[3], w11
-; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: csel w8, w8, w11, gt
+; CHECK-NEXT: mov v3.s[1], w10
; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v0.b[4], w9
-; CHECK-NEXT: mov v2.s[1], w12
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov w9, v2.s[1]
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: mov v0.b[7], w9
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: adrp x8, .LCPI82_0
+; CHECK-NEXT: mov v2.s[1], w13
+; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI82_0]
+; CHECK-NEXT: tbl v0.8b, { v2.16b, v3.16b, v4.16b, v5.16b }, v0.8b
; CHECK-NEXT: ret
%x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f)
ret <8 x i8> %x
@@ -3542,17 +3530,17 @@ define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) {
define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) {
; CHECK-LABEL: test_signed_v8f64_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov d4, v0.d[1]
+; CHECK-NEXT: mov d4, v3.d[1]
; CHECK-NEXT: mov w8, #32767
-; CHECK-NEXT: fcvtzs w10, d0
+; CHECK-NEXT: fcvtzs w10, d3
; CHECK-NEXT: mov w11, #-32768
-; CHECK-NEXT: mov d0, v2.d[1]
-; CHECK-NEXT: fcvtzs w13, d1
-; CHECK-NEXT: fcvtzs w15, d3
+; CHECK-NEXT: mov d3, v1.d[1]
+; CHECK-NEXT: fcvtzs w13, d2
+; CHECK-NEXT: fcvtzs w15, d1
+; CHECK-NEXT: mov d1, v0.d[1]
; CHECK-NEXT: fcvtzs w9, d4
-; CHECK-NEXT: mov d4, v1.d[1]
-; CHECK-NEXT: mov d1, v3.d[1]
-; CHECK-NEXT: fcvtzs w14, d0
+; CHECK-NEXT: mov d4, v2.d[1]
+; CHECK-NEXT: fcvtzs w14, d3
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w9, w9, w8, lt
; CHECK-NEXT: fcvtzs w12, d4
@@ -3567,49 +3555,38 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) {
; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768
; CHECK-NEXT: csel w12, w12, w11, gt
; CHECK-NEXT: cmp w13, w8
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: csel w10, w13, w8, lt
-; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-NEXT: fcvtzs w13, d2
-; CHECK-NEXT: csel w10, w10, w11, gt
-; CHECK-NEXT: cmp w14, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: csel w9, w14, w8, lt
-; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768
-; CHECK-NEXT: fmov s2, w10
-; CHECK-NEXT: csel w9, w9, w11, gt
-; CHECK-NEXT: cmp w13, w8
-; CHECK-NEXT: mov w10, v0.s[1]
; CHECK-NEXT: csel w13, w13, w8, lt
-; CHECK-NEXT: mov v2.s[1], w12
+; CHECK-NEXT: fmov s5, w10
; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768
-; CHECK-NEXT: fcvtzs w12, d1
; CHECK-NEXT: csel w13, w13, w11, gt
-; CHECK-NEXT: mov v0.h[1], w10
-; CHECK-NEXT: fmov w14, s2
-; CHECK-NEXT: cmp w12, w8
-; CHECK-NEXT: fmov s1, w13
-; CHECK-NEXT: csel w12, w12, w8, lt
-; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768
-; CHECK-NEXT: mov w10, v2.s[1]
-; CHECK-NEXT: mov v0.h[2], w14
-; CHECK-NEXT: csel w12, w12, w11, gt
+; CHECK-NEXT: cmp w14, w8
+; CHECK-NEXT: csel w14, w14, w8, lt
+; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768
+; CHECK-NEXT: csel w10, w14, w11, gt
; CHECK-NEXT: cmp w15, w8
-; CHECK-NEXT: mov v1.s[1], w9
+; CHECK-NEXT: fcvtzs w14, d1
+; CHECK-NEXT: csel w15, w15, w8, lt
+; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768
+; CHECK-NEXT: mov v5.s[1], w9
+; CHECK-NEXT: csel w9, w15, w11, gt
+; CHECK-NEXT: cmp w14, w8
+; CHECK-NEXT: fcvtzs w15, d0
+; CHECK-NEXT: fmov s4, w13
+; CHECK-NEXT: csel w13, w14, w8, lt
+; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768
+; CHECK-NEXT: csel w13, w13, w11, gt
+; CHECK-NEXT: cmp w15, w8
+; CHECK-NEXT: mov v4.s[1], w12
; CHECK-NEXT: csel w8, w15, w8, lt
+; CHECK-NEXT: fmov s3, w9
; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768
; CHECK-NEXT: csel w8, w8, w11, gt
-; CHECK-NEXT: mov v0.h[3], w10
-; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: mov v3.s[1], w10
; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v0.h[4], w9
-; CHECK-NEXT: mov v2.s[1], w12
-; CHECK-NEXT: mov v0.h[5], w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov w9, v2.s[1]
-; CHECK-NEXT: mov v0.h[6], w8
-; CHECK-NEXT: mov v0.h[7], w9
+; CHECK-NEXT: adrp x8, .LCPI84_0
+; CHECK-NEXT: mov v2.s[1], w13
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI84_0]
+; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b, v4.16b, v5.16b }, v0.16b
; CHECK-NEXT: ret
%x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f)
ret <8 x i16> %x
@@ -3618,140 +3595,116 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) {
define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) {
; CHECK-LABEL: test_signed_v16f64_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov d16, v0.d[1]
+; CHECK-NEXT: mov d16, v3.d[1]
; CHECK-NEXT: mov w9, #32767
-; CHECK-NEXT: fcvtzs w11, d0
+; CHECK-NEXT: fcvtzs w11, d3
; CHECK-NEXT: mov w8, #-32768
-; CHECK-NEXT: mov d0, v2.d[1]
-; CHECK-NEXT: fcvtzs w12, d1
+; CHECK-NEXT: mov d3, v1.d[1]
; CHECK-NEXT: fcvtzs w14, d2
-; CHECK-NEXT: mov d2, v4.d[1]
+; CHECK-NEXT: fcvtzs w15, d1
+; CHECK-NEXT: mov d1, v7.d[1]
; CHECK-NEXT: fcvtzs w10, d16
-; CHECK-NEXT: mov d16, v1.d[1]
-; CHECK-NEXT: mov d1, v3.d[1]
-; CHECK-NEXT: fcvtzs w16, d3
-; CHECK-NEXT: fcvtzs w15, d0
-; CHECK-NEXT: mov d3, v6.d[1]
+; CHECK-NEXT: mov d16, v2.d[1]
+; CHECK-NEXT: mov d2, v0.d[1]
+; CHECK-NEXT: fcvtzs w18, d0
+; CHECK-NEXT: mov d0, v6.d[1]
+; CHECK-NEXT: fcvtzs w0, d7
; CHECK-NEXT: cmp w10, w9
+; CHECK-NEXT: fcvtzs w2, d6
; CHECK-NEXT: csel w10, w10, w9, lt
-; CHECK-NEXT: fcvtzs w13, d16
+; CHECK-NEXT: fcvtzs w12, d16
; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-NEXT: fcvtzs w17, d1
+; CHECK-NEXT: fcvtzs w17, d2
; CHECK-NEXT: csel w10, w10, w8, gt
; CHECK-NEXT: cmp w11, w9
; CHECK-NEXT: csel w11, w11, w9, lt
-; CHECK-NEXT: mov d1, v5.d[1]
+; CHECK-NEXT: fcvtzs w1, d0
+; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768
+; CHECK-NEXT: mov d0, v4.d[1]
+; CHECK-NEXT: csel w13, w11, w8, gt
+; CHECK-NEXT: cmp w12, w9
+; CHECK-NEXT: csel w11, w12, w9, lt
+; CHECK-NEXT: fcvtzs w12, d3
; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768
; CHECK-NEXT: csel w11, w11, w8, gt
-; CHECK-NEXT: cmp w13, w9
-; CHECK-NEXT: csel w13, w13, w9, lt
-; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768
-; CHECK-NEXT: csel w13, w13, w8, gt
+; CHECK-NEXT: cmp w14, w9
+; CHECK-NEXT: csel w14, w14, w9, lt
+; CHECK-NEXT: fmov s19, w13
+; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768
+; CHECK-NEXT: csel w14, w14, w8, gt
; CHECK-NEXT: cmp w12, w9
; CHECK-NEXT: csel w12, w12, w9, lt
-; CHECK-NEXT: fmov s0, w11
; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768
; CHECK-NEXT: csel w12, w12, w8, gt
; CHECK-NEXT: cmp w15, w9
; CHECK-NEXT: csel w15, w15, w9, lt
; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768
-; CHECK-NEXT: csel w11, w15, w8, gt
-; CHECK-NEXT: cmp w14, w9
-; CHECK-NEXT: csel w14, w14, w9, lt
-; CHECK-NEXT: fcvtzs w15, d4
-; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768
-; CHECK-NEXT: csel w14, w14, w8, gt
+; CHECK-NEXT: csel w16, w15, w8, gt
; CHECK-NEXT: cmp w17, w9
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: csel w10, w17, w9, lt
-; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-NEXT: fcvtzs w17, d2
-; CHECK-NEXT: csel w10, w10, w8, gt
-; CHECK-NEXT: cmp w16, w9
-; CHECK-NEXT: fmov s2, w12
-; CHECK-NEXT: csel w12, w16, w9, lt
-; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768
-; CHECK-NEXT: mov w16, v0.s[1]
-; CHECK-NEXT: csel w12, w12, w8, gt
-; CHECK-NEXT: cmp w17, w9
-; CHECK-NEXT: mov v2.s[1], w13
-; CHECK-NEXT: csel w13, w17, w9, lt
-; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768
+; CHECK-NEXT: csel w15, w17, w9, lt
; CHECK-NEXT: fcvtzs w17, d1
-; CHECK-NEXT: csel w13, w13, w8, gt
-; CHECK-NEXT: cmp w15, w9
-; CHECK-NEXT: csel w15, w15, w9, lt
-; CHECK-NEXT: fmov s4, w14
; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768
-; CHECK-NEXT: mov v0.h[1], w16
-; CHECK-NEXT: fcvtzs w16, d5
+; CHECK-NEXT: mov d1, v5.d[1]
; CHECK-NEXT: csel w15, w15, w8, gt
+; CHECK-NEXT: cmp w18, w9
+; CHECK-NEXT: csel w18, w18, w9, lt
+; CHECK-NEXT: cmn w18, #8, lsl #12 // =32768
+; CHECK-NEXT: csel w18, w18, w8, gt
; CHECK-NEXT: cmp w17, w9
; CHECK-NEXT: csel w17, w17, w9, lt
; CHECK-NEXT: cmn w17, #8, lsl #12 // =32768
-; CHECK-NEXT: csel w14, w17, w8, gt
-; CHECK-NEXT: cmp w16, w9
-; CHECK-NEXT: fmov s1, w15
-; CHECK-NEXT: csel w15, w16, w9, lt
-; CHECK-NEXT: fcvtzs w16, d3
-; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768
-; CHECK-NEXT: mov v4.s[1], w11
-; CHECK-NEXT: csel w11, w15, w8, gt
-; CHECK-NEXT: fcvtzs w15, d6
-; CHECK-NEXT: mov v1.s[1], w13
-; CHECK-NEXT: cmp w16, w9
-; CHECK-NEXT: fmov s3, w11
-; CHECK-NEXT: csel w16, w16, w9, lt
-; CHECK-NEXT: fmov w11, s2
-; CHECK-NEXT: mov w13, v2.s[1]
-; CHECK-NEXT: mov d2, v7.d[1]
-; CHECK-NEXT: cmn w16, #8, lsl #12 // =32768
-; CHECK-NEXT: csel w16, w16, w8, gt
-; CHECK-NEXT: cmp w15, w9
-; CHECK-NEXT: mov v0.h[2], w11
-; CHECK-NEXT: csel w11, w15, w9, lt
-; CHECK-NEXT: mov w15, v1.s[1]
-; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768
-; CHECK-NEXT: mov v3.s[1], w14
-; CHECK-NEXT: fcvtzs w14, d2
-; CHECK-NEXT: csel w11, w11, w8, gt
-; CHECK-NEXT: mov v0.h[3], w13
-; CHECK-NEXT: mov v1.h[1], w15
+; CHECK-NEXT: csel w17, w17, w8, gt
+; CHECK-NEXT: cmp w0, w9
+; CHECK-NEXT: csel w0, w0, w9, lt
+; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768
+; CHECK-NEXT: csel w13, w0, w8, gt
+; CHECK-NEXT: cmp w1, w9
+; CHECK-NEXT: csel w1, w1, w9, lt
+; CHECK-NEXT: fcvtzs w0, d1
+; CHECK-NEXT: cmn w1, #8, lsl #12 // =32768
+; CHECK-NEXT: mov v19.s[1], w10
+; CHECK-NEXT: csel w10, w1, w8, gt
+; CHECK-NEXT: cmp w2, w9
+; CHECK-NEXT: fcvtzs w1, d5
+; CHECK-NEXT: csel w2, w2, w9, lt
+; CHECK-NEXT: fmov s18, w14
+; CHECK-NEXT: cmn w2, #8, lsl #12 // =32768
+; CHECK-NEXT: fmov s23, w13
+; CHECK-NEXT: csel w2, w2, w8, gt
+; CHECK-NEXT: cmp w0, w9
+; CHECK-NEXT: csel w14, w0, w9, lt
+; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768
+; CHECK-NEXT: csel w13, w14, w8, gt
+; CHECK-NEXT: cmp w1, w9
+; CHECK-NEXT: fcvtzs w14, d0
+; CHECK-NEXT: csel w0, w1, w9, lt
+; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768
+; CHECK-NEXT: mov v18.s[1], w11
+; CHECK-NEXT: csel w11, w0, w8, gt
+; CHECK-NEXT: mov v23.s[1], w17
; CHECK-NEXT: cmp w14, w9
-; CHECK-NEXT: fmov w13, s3
+; CHECK-NEXT: fcvtzs w17, d4
; CHECK-NEXT: csel w14, w14, w9, lt
-; CHECK-NEXT: fcvtzs w15, d7
-; CHECK-NEXT: fmov s2, w11
+; CHECK-NEXT: fmov s22, w2
; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768
-; CHECK-NEXT: mov w11, v3.s[1]
-; CHECK-NEXT: mov v1.h[2], w13
-; CHECK-NEXT: csel w13, w14, w8, gt
-; CHECK-NEXT: cmp w15, w9
-; CHECK-NEXT: fmov s3, w12
-; CHECK-NEXT: mov v2.s[1], w16
-; CHECK-NEXT: csel w9, w15, w9, lt
+; CHECK-NEXT: csel w14, w14, w8, gt
+; CHECK-NEXT: fmov s17, w16
+; CHECK-NEXT: cmp w17, w9
+; CHECK-NEXT: mov v22.s[1], w10
+; CHECK-NEXT: csel w9, w17, w9, lt
+; CHECK-NEXT: fmov s21, w11
; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768
-; CHECK-NEXT: fmov w12, s4
; CHECK-NEXT: csel w8, w9, w8, gt
-; CHECK-NEXT: mov w14, v4.s[1]
-; CHECK-NEXT: mov v1.h[3], w11
-; CHECK-NEXT: fmov w11, s2
-; CHECK-NEXT: mov w9, v2.s[1]
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov v0.h[4], w12
-; CHECK-NEXT: mov v1.h[4], w11
-; CHECK-NEXT: mov v3.s[1], w10
-; CHECK-NEXT: mov v2.s[1], w13
-; CHECK-NEXT: mov v0.h[5], w14
-; CHECK-NEXT: mov v1.h[5], w9
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov w10, v3.s[1]
-; CHECK-NEXT: mov w11, v2.s[1]
-; CHECK-NEXT: mov v0.h[6], w8
-; CHECK-NEXT: mov v1.h[6], w9
-; CHECK-NEXT: mov v0.h[7], w10
-; CHECK-NEXT: mov v1.h[7], w11
+; CHECK-NEXT: adrp x9, .LCPI85_0
+; CHECK-NEXT: mov v17.s[1], w12
+; CHECK-NEXT: mov v21.s[1], w13
+; CHECK-NEXT: fmov s16, w18
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI85_0]
+; CHECK-NEXT: fmov s20, w8
+; CHECK-NEXT: mov v16.s[1], w15
+; CHECK-NEXT: mov v20.s[1], w14
+; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
+; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
; CHECK-NEXT: ret
%x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f)
ret <16 x i16> %x
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index d8d4b6f8b98cb..acd92c5e3ccd1 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -2768,58 +2768,46 @@ define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) {
define <8 x i8> @test_unsigned_v8f64_v8i8(<8 x double> %f) {
; CHECK-LABEL: test_unsigned_v8f64_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov d5, v0.d[1]
-; CHECK-NEXT: fcvtzu w10, d0
-; CHECK-NEXT: mov d0, v1.d[1]
+; CHECK-NEXT: mov d4, v3.d[1]
+; CHECK-NEXT: fcvtzu w10, d3
+; CHECK-NEXT: mov d3, v2.d[1]
; CHECK-NEXT: mov w8, #255
-; CHECK-NEXT: fcvtzu w12, d1
-; CHECK-NEXT: mov d4, v2.d[1]
-; CHECK-NEXT: fcvtzu w13, d3
-; CHECK-NEXT: fcvtzu w9, d5
-; CHECK-NEXT: fcvtzu w11, d0
+; CHECK-NEXT: fcvtzu w12, d2
+; CHECK-NEXT: fcvtzu w13, d1
+; CHECK-NEXT: fcvtzu w9, d4
+; CHECK-NEXT: mov d4, v1.d[1]
+; CHECK-NEXT: fcvtzu w11, d3
+; CHECK-NEXT: mov d1, v0.d[1]
; CHECK-NEXT: cmp w9, #255
; CHECK-NEXT: csel w9, w9, w8, lo
; CHECK-NEXT: cmp w10, #255
; CHECK-NEXT: csel w10, w10, w8, lo
; CHECK-NEXT: cmp w11, #255
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: csel w10, w11, w8, lo
+; CHECK-NEXT: csel w11, w11, w8, lo
; CHECK-NEXT: cmp w12, #255
-; CHECK-NEXT: csel w11, w12, w8, lo
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: fcvtzu w9, d4
-; CHECK-NEXT: fmov s1, w11
-; CHECK-NEXT: fcvtzu w11, d2
+; CHECK-NEXT: csel w12, w12, w8, lo
+; CHECK-NEXT: fmov s19, w10
+; CHECK-NEXT: fcvtzu w10, d4
+; CHECK-NEXT: cmp w10, #255
+; CHECK-NEXT: mov v19.s[1], w9
+; CHECK-NEXT: csel w10, w10, w8, lo
+; CHECK-NEXT: cmp w13, #255
+; CHECK-NEXT: fmov s18, w12
+; CHECK-NEXT: fcvtzu w9, d1
+; CHECK-NEXT: csel w12, w13, w8, lo
+; CHECK-NEXT: fcvtzu w13, d0
+; CHECK-NEXT: mov v18.s[1], w11
; CHECK-NEXT: cmp w9, #255
-; CHECK-NEXT: mov d2, v3.d[1]
-; CHECK-NEXT: mov w12, v0.s[1]
+; CHECK-NEXT: fmov s17, w12
; CHECK-NEXT: csel w9, w9, w8, lo
-; CHECK-NEXT: mov v1.s[1], w10
-; CHECK-NEXT: cmp w11, #255
-; CHECK-NEXT: csel w10, w11, w8, lo
-; CHECK-NEXT: mov v0.b[1], w12
-; CHECK-NEXT: fmov w11, s1
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: fcvtzu w10, d2
-; CHECK-NEXT: mov w12, v1.s[1]
-; CHECK-NEXT: mov v0.b[2], w11
-; CHECK-NEXT: mov v4.s[1], w9
-; CHECK-NEXT: cmp w10, #255
-; CHECK-NEXT: csel w9, w10, w8, lo
; CHECK-NEXT: cmp w13, #255
; CHECK-NEXT: csel w8, w13, w8, lo
-; CHECK-NEXT: mov v0.b[3], w12
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov w8, v4.s[1]
-; CHECK-NEXT: mov v0.b[4], w10
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: mov v0.b[7], w9
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: mov v17.s[1], w10
+; CHECK-NEXT: fmov s16, w8
+; CHECK-NEXT: adrp x8, .LCPI82_0
+; CHECK-NEXT: mov v16.s[1], w9
+; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI82_0]
+; CHECK-NEXT: tbl v0.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.8b
; CHECK-NEXT: ret
%x = call <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f)
ret <8 x i8> %x
@@ -2939,57 +2927,46 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) {
define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) {
; CHECK-LABEL: test_unsigned_v8f64_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov d5, v0.d[1]
-; CHECK-NEXT: fcvtzu w10, d0
-; CHECK-NEXT: mov d0, v1.d[1]
+; CHECK-NEXT: mov d4, v3.d[1]
+; CHECK-NEXT: fcvtzu w10, d3
+; CHECK-NEXT: mov d3, v2.d[1]
; CHECK-NEXT: mov w8, #65535
-; CHECK-NEXT: fcvtzu w12, d1
-; CHECK-NEXT: mov d4, v2.d[1]
-; CHECK-NEXT: fcvtzu w13, d3
-; CHECK-NEXT: fcvtzu w9, d5
-; CHECK-NEXT: fcvtzu w11, d0
+; CHECK-NEXT: fcvtzu w12, d2
+; CHECK-NEXT: fcvtzu w13, d1
+; CHECK-NEXT: fcvtzu w9, d4
+; CHECK-NEXT: mov d4, v1.d[1]
+; CHECK-NEXT: fcvtzu w11, d3
+; CHECK-NEXT: mov d1, v0.d[1]
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w9, w9, w8, lo
; CHECK-NEXT: cmp w10, w8
; CHECK-NEXT: csel w10, w10, w8, lo
; CHECK-NEXT: cmp w11, w8
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: csel w10, w11, w8, lo
+; CHECK-NEXT: csel w11, w11, w8, lo
; CHECK-NEXT: cmp w12, w8
-; CHECK-NEXT: csel w11, w12, w8, lo
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: fcvtzu w9, d4
-; CHECK-NEXT: fmov s1, w11
-; CHECK-NEXT: fcvtzu w11, d2
+; CHECK-NEXT: csel w12, w12, w8, lo
+; CHECK-NEXT: fmov s19, w10
+; CHECK-NEXT: fcvtzu w10, d4
+; CHECK-NEXT: cmp w10, w8
+; CHECK-NEXT: mov v19.s[1], w9
+; CHECK-NEXT: csel w10, w10, w8, lo
+; CHECK-NEXT: cmp w13, w8
+; CHECK-NEXT: fmov s18, w12
+; CHECK-NEXT: fcvtzu w9, d1
+; CHECK-NEXT: csel w12, w13, w8, lo
+; CHECK-NEXT: fcvtzu w13, d0
+; CHECK-NEXT: mov v18.s[1], w11
; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: mov d2, v3.d[1]
-; CHECK-NEXT: mov w12, v0.s[1]
+; CHECK-NEXT: fmov s17, w12
; CHECK-NEXT: csel w9, w9, w8, lo
-; CHECK-NEXT: mov v1.s[1], w10
-; CHECK-NEXT: cmp w11, w8
-; CHECK-NEXT: csel w10, w11, w8, lo
-; CHECK-NEXT: mov v0.h[1], w12
-; CHECK-NEXT: fmov w11, s1
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: fcvtzu w10, d2
-; CHECK-NEXT: mov w12, v1.s[1]
-; CHECK-NEXT: mov v0.h[2], w11
-; CHECK-NEXT: mov v4.s[1], w9
-; CHECK-NEXT: cmp w10, w8
-; CHECK-NEXT: csel w9, w10, w8, lo
; CHECK-NEXT: cmp w13, w8
; CHECK-NEXT: csel w8, w13, w8, lo
-; CHECK-NEXT: mov v0.h[3], w12
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov w8, v4.s[1]
-; CHECK-NEXT: mov v0.h[4], w10
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: mov v0.h[5], w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: mov v0.h[6], w8
-; CHECK-NEXT: mov v0.h[7], w9
+; CHECK-NEXT: mov v17.s[1], w10
+; CHECK-NEXT: fmov s16, w8
+; CHECK-NEXT: adrp x8, .LCPI84_0
+; CHECK-NEXT: mov v16.s[1], w9
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI84_0]
+; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
; CHECK-NEXT: ret
%x = call <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f)
ret <8 x i16> %x
@@ -2998,107 +2975,83 @@ define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) {
define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) {
; CHECK-LABEL: test_unsigned_v16f64_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov d16, v0.d[1]
-; CHECK-NEXT: fcvtzu w9, d0
-; CHECK-NEXT: mov d0, v1.d[1]
-; CHECK-NEXT: mov d17, v2.d[1]
-; CHECK-NEXT: fcvtzu w10, d1
-; CHECK-NEXT: mov d1, v3.d[1]
+; CHECK-NEXT: mov d16, v3.d[1]
+; CHECK-NEXT: fcvtzu w9, d3
+; CHECK-NEXT: mov d3, v2.d[1]
; CHECK-NEXT: mov w8, #65535
-; CHECK-NEXT: fcvtzu w12, d2
-; CHECK-NEXT: fcvtzu w11, d16
-; CHECK-NEXT: mov d2, v4.d[1]
+; CHECK-NEXT: fcvtzu w10, d2
+; CHECK-NEXT: mov d2, v1.d[1]
+; CHECK-NEXT: fcvtzu w11, d1
+; CHECK-NEXT: mov d1, v0.d[1]
+; CHECK-NEXT: fcvtzu w12, d16
; CHECK-NEXT: fcvtzu w13, d0
-; CHECK-NEXT: fcvtzu w14, d17
-; CHECK-NEXT: fcvtzu w15, d1
-; CHECK-NEXT: fcvtzu w16, d3
-; CHECK-NEXT: cmp w11, w8
-; CHECK-NEXT: mov d1, v5.d[1]
-; CHECK-NEXT: csel w11, w11, w8, lo
+; CHECK-NEXT: fcvtzu w14, d3
+; CHECK-NEXT: mov d0, v7.d[1]
+; CHECK-NEXT: fcvtzu w15, d2
+; CHECK-NEXT: fcvtzu w17, d6
+; CHECK-NEXT: cmp w12, w8
+; CHECK-NEXT: fcvtzu w16, d1
+; CHECK-NEXT: csel w12, w12, w8, lo
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w9, w9, w8, lo
-; CHECK-NEXT: cmp w13, w8
-; CHECK-NEXT: csel w13, w13, w8, lo
-; CHECK-NEXT: cmp w10, w8
-; CHECK-NEXT: csel w10, w10, w8, lo
; CHECK-NEXT: cmp w14, w8
; CHECK-NEXT: csel w14, w14, w8, lo
-; CHECK-NEXT: cmp w12, w8
-; CHECK-NEXT: csel w12, w12, w8, lo
+; CHECK-NEXT: cmp w10, w8
+; CHECK-NEXT: csel w10, w10, w8, lo
; CHECK-NEXT: cmp w15, w8
-; CHECK-NEXT: fcvtzu w17, d2
-; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: fmov s19, w9
; CHECK-NEXT: csel w9, w15, w8, lo
-; CHECK-NEXT: fcvtzu w15, d4
-; CHECK-NEXT: cmp w16, w8
-; CHECK-NEXT: fcvtzu w18, d1
-; CHECK-NEXT: csel w16, w16, w8, lo
-; CHECK-NEXT: cmp w17, w8
-; CHECK-NEXT: csel w17, w17, w8, lo
-; CHECK-NEXT: cmp w15, w8
-; CHECK-NEXT: mov v0.s[1], w11
-; CHECK-NEXT: fcvtzu w0, d5
-; CHECK-NEXT: csel w11, w15, w8, lo
-; CHECK-NEXT: fmov s2, w10
-; CHECK-NEXT: cmp w18, w8
-; CHECK-NEXT: mov d4, v6.d[1]
-; CHECK-NEXT: csel w10, w18, w8, lo
-; CHECK-NEXT: cmp w0, w8
-; CHECK-NEXT: fmov s1, w11
-; CHECK-NEXT: csel w11, w0, w8, lo
-; CHECK-NEXT: mov v2.s[1], w13
-; CHECK-NEXT: mov w13, v0.s[1]
-; CHECK-NEXT: fcvtzu w15, d4
-; CHECK-NEXT: mov v1.s[1], w17
-; CHECK-NEXT: fmov s3, w11
-; CHECK-NEXT: mov d4, v7.d[1]
-; CHECK-NEXT: mov v0.h[1], w13
-; CHECK-NEXT: fmov w11, s2
-; CHECK-NEXT: mov v3.s[1], w10
-; CHECK-NEXT: cmp w15, w8
-; CHECK-NEXT: mov w10, v1.s[1]
-; CHECK-NEXT: mov w13, v2.s[1]
-; CHECK-NEXT: fmov s2, w12
-; CHECK-NEXT: mov v0.h[2], w11
-; CHECK-NEXT: fcvtzu w11, d6
-; CHECK-NEXT: csel w12, w15, w8, lo
-; CHECK-NEXT: mov v1.h[1], w10
-; CHECK-NEXT: fmov w10, s3
; CHECK-NEXT: cmp w11, w8
+; CHECK-NEXT: fcvtzu w15, d0
+; CHECK-NEXT: mov d0, v6.d[1]
; CHECK-NEXT: csel w11, w11, w8, lo
-; CHECK-NEXT: mov v0.h[3], w13
-; CHECK-NEXT: fcvtzu w13, d7
-; CHECK-NEXT: mov v1.h[2], w10
-; CHECK-NEXT: fmov s5, w11
-; CHECK-NEXT: fcvtzu w10, d4
-; CHECK-NEXT: mov w11, v3.s[1]
-; CHECK-NEXT: mov v2.s[1], w14
-; CHECK-NEXT: fmov s3, w16
-; CHECK-NEXT: mov v5.s[1], w12
-; CHECK-NEXT: cmp w10, w8
-; CHECK-NEXT: csel w10, w10, w8, lo
+; CHECK-NEXT: mov v19.s[1], w12
+; CHECK-NEXT: cmp w16, w8
+; CHECK-NEXT: fcvtzu w12, d7
+; CHECK-NEXT: fmov s18, w10
+; CHECK-NEXT: csel w10, w16, w8, lo
; CHECK-NEXT: cmp w13, w8
-; CHECK-NEXT: csel w8, w13, w8, lo
-; CHECK-NEXT: fmov w12, s2
-; CHECK-NEXT: mov v1.h[3], w11
-; CHECK-NEXT: fmov w13, s5
-; CHECK-NEXT: mov w14, v2.s[1]
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov w11, v5.s[1]
-; CHECK-NEXT: mov v0.h[4], w12
-; CHECK-NEXT: mov v1.h[4], w13
-; CHECK-NEXT: mov v3.s[1], w9
-; CHECK-NEXT: mov v2.s[1], w10
-; CHECK-NEXT: mov v0.h[5], w14
-; CHECK-NEXT: mov v1.h[5], w11
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov w10, v3.s[1]
-; CHECK-NEXT: mov w11, v2.s[1]
-; CHECK-NEXT: mov v0.h[6], w8
-; CHECK-NEXT: mov v1.h[6], w9
-; CHECK-NEXT: mov v0.h[7], w10
-; CHECK-NEXT: mov v1.h[7], w11
+; CHECK-NEXT: fcvtzu w16, d0
+; CHECK-NEXT: csel w13, w13, w8, lo
+; CHECK-NEXT: cmp w15, w8
+; CHECK-NEXT: csel w15, w15, w8, lo
+; CHECK-NEXT: cmp w12, w8
+; CHECK-NEXT: mov d0, v5.d[1]
+; CHECK-NEXT: csel w12, w12, w8, lo
+; CHECK-NEXT: cmp w16, w8
+; CHECK-NEXT: mov v18.s[1], w14
+; CHECK-NEXT: fmov s23, w12
+; CHECK-NEXT: csel w12, w16, w8, lo
+; CHECK-NEXT: cmp w17, w8
+; CHECK-NEXT: fcvtzu w16, d0
+; CHECK-NEXT: mov d0, v4.d[1]
+; CHECK-NEXT: csel w14, w17, w8, lo
+; CHECK-NEXT: fcvtzu w17, d5
+; CHECK-NEXT: fmov s17, w11
+; CHECK-NEXT: mov v23.s[1], w15
+; CHECK-NEXT: cmp w16, w8
+; CHECK-NEXT: fmov s22, w14
+; CHECK-NEXT: csel w14, w16, w8, lo
+; CHECK-NEXT: cmp w17, w8
+; CHECK-NEXT: fcvtzu w16, d0
+; CHECK-NEXT: csel w15, w17, w8, lo
+; CHECK-NEXT: fcvtzu w11, d4
+; CHECK-NEXT: mov v22.s[1], w12
+; CHECK-NEXT: cmp w16, w8
+; CHECK-NEXT: fmov s21, w15
+; CHECK-NEXT: csel w12, w16, w8, lo
+; CHECK-NEXT: cmp w11, w8
+; CHECK-NEXT: csel w8, w11, w8, lo
+; CHECK-NEXT: mov v17.s[1], w9
+; CHECK-NEXT: adrp x9, .LCPI85_0
+; CHECK-NEXT: mov v21.s[1], w14
+; CHECK-NEXT: fmov s16, w13
+; CHECK-NEXT: fmov s20, w8
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI85_0]
+; CHECK-NEXT: mov v16.s[1], w10
+; CHECK-NEXT: mov v20.s[1], w12
+; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
+; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
; CHECK-NEXT: ret
%x = call <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f)
ret <16 x i16> %x
diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
index dd7dd44bedf7b..aaa7dd00419ed 100644
--- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
@@ -268,36 +268,13 @@ entry:
define <16 x i8> @extract_4_v4i32_badindex(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: extract_4_v4i32_badindex:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[3]
-; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov v0.b[2], w9
-; CHECK-NEXT: mov w9, v1.s[2]
-; CHECK-NEXT: mov v0.b[3], w10
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v0.b[5], w9
-; CHECK-NEXT: mov w9, v1.s[3]
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov v0.b[7], w9
-; CHECK-NEXT: mov w9, v2.s[1]
-; CHECK-NEXT: mov v0.b[8], w8
-; CHECK-NEXT: mov w8, v2.s[2]
-; CHECK-NEXT: mov v0.b[9], w9
-; CHECK-NEXT: mov w9, v2.s[3]
-; CHECK-NEXT: mov v0.b[10], w8
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov v0.b[11], w9
-; CHECK-NEXT: mov w9, v3.s[1]
-; CHECK-NEXT: mov v0.b[12], w8
-; CHECK-NEXT: mov w8, v3.s[2]
-; CHECK-NEXT: mov v0.b[13], w9
-; CHECK-NEXT: mov w9, v3.s[3]
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: mov v0.b[15], w9
+; CHECK-NEXT: adrp x8, .LCPI5_0
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; CHECK-NEXT: ret
entry:
%a0 = extractelement <4 x i32> %a, i32 0
diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
index fe491646c921a..a3a36ca8089a8 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
@@ -1,46 +1,33 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-eabi < %s | FileCheck %s
+; CHECK: .LCPI0_0:
+; CHECK: .byte 0 // 0x0
+; CHECK: .byte 16 // 0x10
+; CHECK: .byte 32 // 0x20
+; CHECK: .byte 48 // 0x30
+; CHECK: .byte 2 // 0x2
+; CHECK: .byte 18 // 0x12
+; CHECK: .byte 34 // 0x22
+; CHECK: .byte 50 // 0x32
+; CHECK: .byte 4 // 0x4
+; CHECK: .byte 20 // 0x14
+; CHECK: .byte 36 // 0x24
+; CHECK: .byte 52 // 0x34
+; CHECK: .byte 6 // 0x6
+; CHECK: .byte 22 // 0x16
+; CHECK: .byte 38 // 0x26
+; CHECK: .byte 54 // 0x36
define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-LABEL: shuffle4_v4i8_16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: umov w10, v1.h[0]
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: umov w8, v2.h[0]
-; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT: fmov s4, w9
-; CHECK-NEXT: mov v4.b[1], w10
-; CHECK-NEXT: mov v4.b[2], w8
-; CHECK-NEXT: umov w8, v3.h[0]
-; CHECK-NEXT: mov v4.b[3], w8
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: mov v4.b[4], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v4.b[5], w8
-; CHECK-NEXT: umov w8, v2.h[1]
-; CHECK-NEXT: mov v4.b[6], w8
-; CHECK-NEXT: umov w8, v3.h[1]
-; CHECK-NEXT: mov v4.b[7], w8
-; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: mov v4.b[8], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v4.b[9], w8
-; CHECK-NEXT: umov w8, v2.h[2]
-; CHECK-NEXT: mov v4.b[10], w8
-; CHECK-NEXT: umov w8, v3.h[2]
-; CHECK-NEXT: mov v4.b[11], w8
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: mov v4.b[12], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: mov v4.b[13], w8
-; CHECK-NEXT: umov w8, v2.h[3]
-; CHECK-NEXT: mov v4.b[14], w8
-; CHECK-NEXT: umov w8, v3.h[3]
-; CHECK-NEXT: mov v4.b[15], w8
-; CHECK-NEXT: mov v0.16b, v4.16b
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; CHECK-NEXT: ret
%x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -48,30 +35,25 @@ define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i
ret <16 x i8> %z
}
+; CHECK: .LCPI1_0:
+; CHECK: .byte 0 // 0x0
+; CHECK: .byte 16 // 0x10
+; CHECK: .byte 32 // 0x20
+; CHECK: .byte 48 // 0x30
+; CHECK: .byte 2 // 0x2
+; CHECK: .byte 18 // 0x12
+; CHECK: .byte 34 // 0x22
+; CHECK: .byte 50 // 0x32
define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-LABEL: shuffle4_v4i8_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: umov w10, v1.h[0]
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: umov w8, v2.h[0]
-; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT: fmov s4, w9
-; CHECK-NEXT: umov w9, v3.h[0]
-; CHECK-NEXT: mov v4.b[1], w10
-; CHECK-NEXT: mov v4.b[2], w8
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: mov v4.b[3], w9
-; CHECK-NEXT: umov w9, v1.h[1]
-; CHECK-NEXT: mov v4.b[4], w8
-; CHECK-NEXT: umov w8, v2.h[1]
-; CHECK-NEXT: mov v4.b[5], w9
-; CHECK-NEXT: umov w9, v3.h[1]
-; CHECK-NEXT: mov v4.b[6], w8
-; CHECK-NEXT: mov v4.b[7], w9
-; CHECK-NEXT: fmov d0, d4
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
; CHECK-NEXT: ret
%x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -388,46 +370,33 @@ define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x
ret <8 x i16> %z
}
+; CHECK: .LCPI9_0:
+; CHECK: .byte 0 // 0x0
+; CHECK: .byte 16 // 0x10
+; CHECK: .byte 32 // 0x20
+; CHECK: .byte 48 // 0x30
+; CHECK: .byte 2 // 0x2
+; CHECK: .byte 18 // 0x12
+; CHECK: .byte 34 // 0x22
+; CHECK: .byte 50 // 0x32
+; CHECK: .byte 4 // 0x4
+; CHECK: .byte 20 // 0x14
+; CHECK: .byte 36 // 0x24
+; CHECK: .byte 52 // 0x34
+; CHECK: .byte 6 // 0x6
+; CHECK: .byte 22 // 0x16
+; CHECK: .byte 38 // 0x26
+; CHECK: .byte 54 // 0x36
define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %ce, <4 x i16> %de) {
; CHECK-LABEL: shuffle4_v4i16_trunc:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: umov w10, v1.h[0]
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: umov w8, v2.h[0]
-; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT: fmov s4, w9
-; CHECK-NEXT: mov v4.b[1], w10
-; CHECK-NEXT: mov v4.b[2], w8
-; CHECK-NEXT: umov w8, v3.h[0]
-; CHECK-NEXT: mov v4.b[3], w8
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: mov v4.b[4], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v4.b[5], w8
-; CHECK-NEXT: umov w8, v2.h[1]
-; CHECK-NEXT: mov v4.b[6], w8
-; CHECK-NEXT: umov w8, v3.h[1]
-; CHECK-NEXT: mov v4.b[7], w8
-; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: mov v4.b[8], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v4.b[9], w8
-; CHECK-NEXT: umov w8, v2.h[2]
-; CHECK-NEXT: mov v4.b[10], w8
-; CHECK-NEXT: umov w8, v3.h[2]
-; CHECK-NEXT: mov v4.b[11], w8
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: mov v4.b[12], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: mov v4.b[13], w8
-; CHECK-NEXT: umov w8, v2.h[3]
-; CHECK-NEXT: mov v4.b[14], w8
-; CHECK-NEXT: umov w8, v3.h[3]
-; CHECK-NEXT: mov v4.b[15], w8
-; CHECK-NEXT: mov v0.16b, v4.16b
+; CHECK-NEXT: adrp x8, .LCPI9_0
+; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; CHECK-NEXT: ret
%a = trunc <4 x i16> %ae to <4 x i8>
%b = trunc <4 x i16> %be to <4 x i8>
@@ -439,45 +408,34 @@ define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %
ret <16 x i8> %z
}
+; CHECK: .LCPI10_0:
+; CHECK: .byte 0 // 0x0
+; CHECK: .byte 16 // 0x10
+; CHECK: .byte 32 // 0x20
+; CHECK: .byte 48 // 0x30
+; CHECK: .byte 2 // 0x2
+; CHECK: .byte 18 // 0x12
+; CHECK: .byte 34 // 0x22
+; CHECK: .byte 50 // 0x32
+; CHECK: .byte 4 // 0x4
+; CHECK: .byte 20 // 0x14
+; CHECK: .byte 36 // 0x24
+; CHECK: .byte 52 // 0x34
+; CHECK: .byte 6 // 0x6
+; CHECK: .byte 22 // 0x16
+; CHECK: .byte 38 // 0x26
+; CHECK: .byte 54 // 0x36
+; CHECK: .text
define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) {
; CHECK-LABEL: shuffle4_v4i32_trunc:
; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI10_0
; CHECK-NEXT: xtn v4.4h, v0.4s
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: xtn v2.4h, v2.4s
-; CHECK-NEXT: xtn v3.4h, v3.4s
-; CHECK-NEXT: umov w8, v4.h[0]
-; CHECK-NEXT: umov w9, v1.h[0]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: umov w8, v2.h[0]
-; CHECK-NEXT: mov v0.b[1], w9
-; CHECK-NEXT: mov v0.b[2], w8
-; CHECK-NEXT: umov w8, v3.h[0]
-; CHECK-NEXT: mov v0.b[3], w8
-; CHECK-NEXT: umov w8, v4.h[1]
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: umov w8, v2.h[1]
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: umov w8, v3.h[1]
-; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: umov w8, v4.h[2]
-; CHECK-NEXT: mov v0.b[8], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v0.b[9], w8
-; CHECK-NEXT: umov w8, v2.h[2]
-; CHECK-NEXT: mov v0.b[10], w8
-; CHECK-NEXT: umov w8, v3.h[2]
-; CHECK-NEXT: mov v0.b[11], w8
-; CHECK-NEXT: umov w8, v4.h[3]
-; CHECK-NEXT: mov v0.b[12], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: mov v0.b[13], w8
-; CHECK-NEXT: umov w8, v2.h[3]
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: umov w8, v3.h[3]
-; CHECK-NEXT: mov v0.b[15], w8
+; CHECK-NEXT: xtn v5.4h, v1.4s
+; CHECK-NEXT: xtn v6.4h, v2.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT: xtn v7.4h, v3.4s
+; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
; CHECK-NEXT: ret
%a = trunc <4 x i32> %ae to <4 x i8>
%b = trunc <4 x i32> %be to <4 x i8>
@@ -489,37 +447,32 @@ define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %
ret <16 x i8> %z
}
+; CHECK: .LCPI11_0:
+; CHECK: .byte 0 // 0x0
+; CHECK: .byte 16 // 0x10
+; CHECK: .byte 32 // 0x20
+; CHECK: .byte 2 // 0x2
+; CHECK: .byte 18 // 0x12
+; CHECK: .byte 34 // 0x22
+; CHECK: .byte 4 // 0x4
+; CHECK: .byte 20 // 0x14
+; CHECK: .byte 36 // 0x24
+; CHECK: .byte 6 // 0x6
+; CHECK: .byte 22 // 0x16
+; CHECK: .byte 38 // 0x26
+; CHECK: .byte 255 // 0xff
+; CHECK: .byte 255 // 0xff
+; CHECK: .byte 255 // 0xff
+; CHECK: .byte 255 // 0xff
define <12 x i8> @shuffle3_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) {
; CHECK-LABEL: shuffle3_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: umov w9, v1.h[0]
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: umov w8, v2.h[0]
-; CHECK-NEXT: mov v3.b[1], w9
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: mov v3.b[2], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v3.b[3], w9
-; CHECK-NEXT: umov w9, v2.h[1]
-; CHECK-NEXT: mov v3.b[4], w8
-; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: mov v3.b[5], w9
-; CHECK-NEXT: umov w9, v1.h[2]
-; CHECK-NEXT: mov v3.b[6], w8
-; CHECK-NEXT: umov w8, v2.h[2]
-; CHECK-NEXT: mov v3.b[7], w9
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: mov v3.b[8], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: mov v3.b[9], w9
-; CHECK-NEXT: umov w9, v2.h[3]
-; CHECK-NEXT: mov v3.b[10], w8
-; CHECK-NEXT: mov v3.b[11], w9
-; CHECK-NEXT: mov v0.16b, v3.16b
+; CHECK-NEXT: adrp x8, .LCPI11_0
+; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
; CHECK-NEXT: ret
%x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%y = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index d9b610da84db8..946128cc18c0b 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -371,59 +371,38 @@ define void @loop3(i8* noalias nocapture noundef writeonly %dst, float* nocaptur
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_8: // %vector.ph
; CHECK-NEXT: add x11, x8, #1
-; CHECK-NEXT: mov w13, #1132396544
+; CHECK-NEXT: adrp x12, .LCPI2_0
; CHECK-NEXT: and x10, x11, #0x1fffffffc
+; CHECK-NEXT: mov w13, #1132396544
+; CHECK-NEXT: add x8, x10, x10, lsl #1
+; CHECK-NEXT: ldr q0, [x12, :lo12:.LCPI2_0]
+; CHECK-NEXT: add x9, x0, x8
; CHECK-NEXT: mov x12, x10
-; CHECK-NEXT: add x9, x10, x10, lsl #1
-; CHECK-NEXT: dup v0.4s, w13
-; CHECK-NEXT: add x8, x1, x9, lsl #2
-; CHECK-NEXT: add x9, x0, x9
+; CHECK-NEXT: add x8, x1, x8, lsl #2
+; CHECK-NEXT: dup v1.4s, w13
; CHECK-NEXT: .LBB2_9: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld3 { v1.4s, v2.4s, v3.4s }, [x1], #48
-; CHECK-NEXT: fcmlt v4.4s, v1.4s, #0.0
+; CHECK-NEXT: ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48
+; CHECK-NEXT: fcmlt v5.4s, v2.4s, #0.0
+; CHECK-NEXT: add x13, x0, #8
+; CHECK-NEXT: fmin v6.4s, v2.4s, v1.4s
; CHECK-NEXT: subs x12, x12, #4
-; CHECK-NEXT: fmin v5.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmin v6.4s, v2.4s, v0.4s
; CHECK-NEXT: fcmlt v7.4s, v3.4s, #0.0
-; CHECK-NEXT: fmin v1.4s, v3.4s, v0.4s
-; CHECK-NEXT: bic v4.16b, v5.16b, v4.16b
-; CHECK-NEXT: fcmlt v5.4s, v2.4s, #0.0
-; CHECK-NEXT: fcvtzs v4.4s, v4.4s
-; CHECK-NEXT: bic v1.16b, v1.16b, v7.16b
-; CHECK-NEXT: fcvtzs v1.4s, v1.4s
-; CHECK-NEXT: bic v2.16b, v6.16b, v5.16b
+; CHECK-NEXT: fmin v16.4s, v3.4s, v1.4s
+; CHECK-NEXT: fmin v2.4s, v4.4s, v1.4s
+; CHECK-NEXT: bic v5.16b, v6.16b, v5.16b
+; CHECK-NEXT: fcmlt v6.4s, v4.4s, #0.0
+; CHECK-NEXT: bic v3.16b, v16.16b, v7.16b
+; CHECK-NEXT: fcvtzs v4.4s, v5.4s
+; CHECK-NEXT: fcvtzs v3.4s, v3.4s
+; CHECK-NEXT: bic v2.16b, v2.16b, v6.16b
; CHECK-NEXT: fcvtzs v2.4s, v2.4s
-; CHECK-NEXT: xtn v3.4h, v4.4s
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: umov w13, v3.h[0]
-; CHECK-NEXT: xtn v2.4h, v2.4s
-; CHECK-NEXT: umov w14, v2.h[0]
-; CHECK-NEXT: fmov s4, w13
-; CHECK-NEXT: umov w13, v1.h[0]
-; CHECK-NEXT: mov v4.b[1], w14
-; CHECK-NEXT: umov w14, v3.h[1]
-; CHECK-NEXT: mov v4.b[2], w13
-; CHECK-NEXT: umov w13, v2.h[1]
-; CHECK-NEXT: mov v4.b[3], w14
-; CHECK-NEXT: umov w14, v1.h[1]
-; CHECK-NEXT: mov v4.b[4], w13
-; CHECK-NEXT: umov w13, v3.h[2]
-; CHECK-NEXT: mov v4.b[5], w14
-; CHECK-NEXT: umov w14, v2.h[2]
-; CHECK-NEXT: mov v4.b[6], w13
-; CHECK-NEXT: umov w13, v1.h[2]
-; CHECK-NEXT: mov v4.b[7], w14
-; CHECK-NEXT: umov w14, v3.h[3]
-; CHECK-NEXT: mov v4.b[8], w13
-; CHECK-NEXT: umov w13, v2.h[3]
-; CHECK-NEXT: mov v4.b[9], w14
-; CHECK-NEXT: umov w14, v1.h[3]
-; CHECK-NEXT: mov v4.b[10], w13
-; CHECK-NEXT: add x13, x0, #8
-; CHECK-NEXT: mov v4.b[11], w14
-; CHECK-NEXT: str d4, [x0], #12
-; CHECK-NEXT: st1 { v4.s }[2], [x13]
+; CHECK-NEXT: xtn v4.4h, v4.4s
+; CHECK-NEXT: xtn v5.4h, v3.4s
+; CHECK-NEXT: xtn v6.4h, v2.4s
+; CHECK-NEXT: tbl v2.16b, { v4.16b, v5.16b, v6.16b }, v0.16b
+; CHECK-NEXT: str d2, [x0], #12
+; CHECK-NEXT: st1 { v2.s }[2], [x13]
; CHECK-NEXT: b.ne .LBB2_9
; CHECK-NEXT: // %bb.10: // %middle.block
; CHECK-NEXT: cmp x11, x10
@@ -606,69 +585,40 @@ define void @loop4(i8* noalias nocapture noundef writeonly %dst, float* nocaptur
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB3_8: // %vector.ph
; CHECK-NEXT: add x11, x8, #1
-; CHECK-NEXT: mov w13, #1132396544
+; CHECK-NEXT: adrp x12, .LCPI3_0
; CHECK-NEXT: and x10, x11, #0x1fffffffc
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: mov w13, #1132396544
; CHECK-NEXT: add x8, x1, x10, lsl #4
; CHECK-NEXT: add x9, x0, x10, lsl #2
-; CHECK-NEXT: dup v0.4s, w13
+; CHECK-NEXT: ldr q0, [x12, :lo12:.LCPI3_0]
+; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: dup v1.4s, w13
; CHECK-NEXT: .LBB3_9: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x1], #64
-; CHECK-NEXT: fcmlt v5.4s, v1.4s, #0.0
+; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
+; CHECK-NEXT: fcmlt v6.4s, v2.4s, #0.0
; CHECK-NEXT: subs x12, x12, #4
-; CHECK-NEXT: fmin v6.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmin v7.4s, v2.4s, v0.4s
+; CHECK-NEXT: fmin v7.4s, v2.4s, v1.4s
; CHECK-NEXT: fcmlt v16.4s, v3.4s, #0.0
-; CHECK-NEXT: fmin v17.4s, v3.4s, v0.4s
-; CHECK-NEXT: bic v5.16b, v6.16b, v5.16b
-; CHECK-NEXT: fcmlt v6.4s, v2.4s, #0.0
-; CHECK-NEXT: fcvtzs v5.4s, v5.4s
-; CHECK-NEXT: fmin v1.4s, v4.4s, v0.4s
+; CHECK-NEXT: fmin v17.4s, v3.4s, v1.4s
+; CHECK-NEXT: fmin v18.4s, v4.4s, v1.4s
; CHECK-NEXT: bic v6.16b, v7.16b, v6.16b
-; CHECK-NEXT: fcvtzs v6.4s, v6.4s
-; CHECK-NEXT: xtn v5.4h, v5.4s
-; CHECK-NEXT: bic v7.16b, v17.16b, v16.16b
-; CHECK-NEXT: fcmlt v16.4s, v4.4s, #0.0
-; CHECK-NEXT: umov w13, v5.h[0]
-; CHECK-NEXT: xtn v2.4h, v6.4s
-; CHECK-NEXT: fcvtzs v3.4s, v7.4s
-; CHECK-NEXT: umov w14, v2.h[0]
-; CHECK-NEXT: bic v1.16b, v1.16b, v16.16b
-; CHECK-NEXT: fmov s4, w13
-; CHECK-NEXT: xtn v3.4h, v3.4s
-; CHECK-NEXT: fcvtzs v1.4s, v1.4s
-; CHECK-NEXT: mov v4.b[1], w14
-; CHECK-NEXT: umov w13, v3.h[0]
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: mov v4.b[2], w13
-; CHECK-NEXT: umov w13, v1.h[0]
-; CHECK-NEXT: mov v4.b[3], w13
-; CHECK-NEXT: umov w13, v5.h[1]
-; CHECK-NEXT: mov v4.b[4], w13
-; CHECK-NEXT: umov w13, v2.h[1]
-; CHECK-NEXT: mov v4.b[5], w13
-; CHECK-NEXT: umov w13, v3.h[1]
-; CHECK-NEXT: mov v4.b[6], w13
-; CHECK-NEXT: umov w13, v1.h[1]
-; CHECK-NEXT: mov v4.b[7], w13
-; CHECK-NEXT: umov w13, v5.h[2]
-; CHECK-NEXT: mov v4.b[8], w13
-; CHECK-NEXT: umov w13, v2.h[2]
-; CHECK-NEXT: mov v4.b[9], w13
-; CHECK-NEXT: umov w13, v3.h[2]
-; CHECK-NEXT: mov v4.b[10], w13
-; CHECK-NEXT: umov w13, v1.h[2]
-; CHECK-NEXT: mov v4.b[11], w13
-; CHECK-NEXT: umov w13, v5.h[3]
-; CHECK-NEXT: mov v4.b[12], w13
-; CHECK-NEXT: umov w13, v2.h[3]
-; CHECK-NEXT: mov v4.b[13], w13
-; CHECK-NEXT: umov w13, v3.h[3]
-; CHECK-NEXT: mov v4.b[14], w13
-; CHECK-NEXT: umov w13, v1.h[3]
-; CHECK-NEXT: mov v4.b[15], w13
-; CHECK-NEXT: str q4, [x0], #16
+; CHECK-NEXT: fcmlt v7.4s, v4.4s, #0.0
+; CHECK-NEXT: bic v16.16b, v17.16b, v16.16b
+; CHECK-NEXT: fcmlt v17.4s, v5.4s, #0.0
+; CHECK-NEXT: fmin v2.4s, v5.4s, v1.4s
+; CHECK-NEXT: fcvtzs v4.4s, v6.4s
+; CHECK-NEXT: bic v3.16b, v18.16b, v7.16b
+; CHECK-NEXT: fcvtzs v5.4s, v16.4s
+; CHECK-NEXT: fcvtzs v3.4s, v3.4s
+; CHECK-NEXT: bic v2.16b, v2.16b, v17.16b
+; CHECK-NEXT: fcvtzs v2.4s, v2.4s
+; CHECK-NEXT: xtn v16.4h, v4.4s
+; CHECK-NEXT: xtn v17.4h, v5.4s
+; CHECK-NEXT: xtn v18.4h, v3.4s
+; CHECK-NEXT: xtn v19.4h, v2.4s
+; CHECK-NEXT: tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
+; CHECK-NEXT: str q2, [x0], #16
; CHECK-NEXT: b.ne .LBB3_9
; CHECK-NEXT: // %bb.10: // %middle.block
; CHECK-NEXT: cmp x11, x10
More information about the llvm-commits
mailing list