[llvm] [AArch64][SVE] Support lowering fixed-length BUILD_VECTORS to ZIPs (PR #111698)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 10 06:16:56 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
Changes:
This allows lowering fixed-length (non-constant) BUILD_VECTORs (<= 128-bit) to a chain of ZIP1 instructions when Neon is not available, rather than the default lowering, which spills the elements to the stack and reloads the vector.
For example,
```
t5: v4f32 = BUILD_VECTOR(t0, t1, t2, t3)
```
Becomes:
```
zip1 z0.s, z0.s, z1.s // z0 = t0,t1,...
zip1 z2.s, z2.s, z3.s // z2 = t2,t3,...
zip1 z0.d, z0.d, z2.d // z0 = t0,t1,t2,t3,...
```
When the values are already in FPRs, this generally leads to more compact output with less traffic to and from the stack.
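Under the hood this is a pairwise reduction: each operand is first inserted into lane 0 of its own SVE register, then adjacent registers are combined with ZIP1 at a doubling element width until a single register holds every element, so a full N-element vector costs N - 1 zips across log2(N) rounds. Below is a minimal standalone C++ model of that loop; strings stand in for SDValues, and all names are illustrative rather than taken from the patch:

```cpp
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothetical v4f32 BUILD_VECTOR operands t0..t3, each modelled as if
  // already inserted into lane 0 of its own register. A power-of-two count
  // is assumed, mirroring the isPow2VectorType() check in the patch.
  std::vector<std::string> Intermediates = {"t0", "t1", "t2", "t3"};
  unsigned ZipBits = 32; // element width of the first ZIP1 round (f32)

  while (Intermediates.size() > 1) {
    for (unsigned I = 0; I < Intermediates.size(); I += 2) {
      std::string Op0 = Intermediates[I], Op1 = Intermediates[I + 1];
      // An undef right-hand side needs no zip, matching the Op1.isUndef()
      // short-circuit in the patch.
      Intermediates[I / 2] =
          Op1 == "undef"
              ? Op0
              : "zip1." + std::to_string(ZipBits) + "(" + Op0 + "," + Op1 + ")";
    }
    Intermediates.resize(Intermediates.size() / 2);
    ZipBits *= 2; // each round zips pairs at twice the element width
  }

  assert(Intermediates.size() == 1);
  // Prints: zip1.64(zip1.32(t0,t1),zip1.32(t2,t3))
  std::cout << Intermediates[0] << "\n";
}
```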
---
Patch is 138.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111698.diff
19 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+62-13)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+1)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll (+94-168)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll (+3-5)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll (+1-8)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll (+252)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll (+23-47)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll (+22-22)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll (+8-24)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll (+21-31)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll (+320-372)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll (+2-9)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll (+2-6)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll (+8-16)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll (+22-31)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll (+30-52)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll (+19-39)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll (+442-690)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll (+4-12)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 288fd3639e5eb7..bb2a7587849c59 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -2102,7 +2103,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
setOperationAction(ISD::BITREVERSE, VT, Default);
setOperationAction(ISD::BSWAP, VT, Default);
- setOperationAction(ISD::BUILD_VECTOR, VT, Default);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Default);
setOperationAction(ISD::CTLZ, VT, Default);
setOperationAction(ISD::CTPOP, VT, Default);
@@ -14384,24 +14385,72 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ auto *BVN = cast<BuildVectorSDNode>(Op);
- if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
- if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
- SDLoc DL(Op);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
- SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
- SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
- return convertFromScalableVector(DAG, Op.getValueType(), Seq);
- }
+ if (auto SeqInfo = BVN->isConstantSequence()) {
+ SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
+ SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
+ SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
+ return convertFromScalableVector(DAG, VT, Seq);
+ }
+
+ unsigned NumElems = VT.getVectorNumElements();
+ if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
+ NumElems <= 1 || BVN->isConstant())
+ return SDValue();
+
+ auto IsExtractElt = [](SDValue Op) {
+ return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+ };
- // Revert to common legalisation for all other variants.
+ // For integer types that are not already in vectors limit to at most four
+ // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
+ if (VT.getScalarType().isInteger() &&
+ NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
return SDValue();
+
+ // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
+ SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
+ SmallVector<SDValue, 16> Intermediates = llvm::map_to_vector<16>(
+ Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
+ return Op.isUndef() ? Undef
+ : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+ ContainerVT, Undef, Op, ZeroI64);
+ });
+
+ ElementCount ZipEC = ContainerVT.getVectorElementCount();
+ while (Intermediates.size() > 1) {
+ EVT ZipVT = getPackedSVEVectorVT(ZipEC);
+
+ for (unsigned I = 0; I < Intermediates.size(); I += 2) {
+ SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
+ SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
+ Intermediates[I / 2] =
+ Op1.isUndef() ? Op0
+ : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
+ }
+
+ Intermediates.resize(Intermediates.size() / 2);
+ ZipEC = ZipEC.divideCoefficientBy(2);
}
+ assert(Intermediates.size() == 1);
+ SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
+ return convertFromScalableVector(DAG, VT, Vec);
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+ return LowerFixedLengthBuildVectorToSVE(Op, DAG);
+
// Try to build a simple constant vector.
Op = NormalizeBuildVector(Op, DAG);
// Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1bae7562f459a5..95489f85631801 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1256,6 +1256,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op,
SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthBuildVectorToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 276f23703df3df..20659cde83ee00 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -140,98 +140,65 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
;
; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_128_NOMAX: // %bb.0:
-; SVE2_128_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_128_NOMAX-NEXT: fmov w8, s3
; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_128_NOMAX-NEXT: fmov w8, s3
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_128_NOMAX-NEXT: fmov w9, s0
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT: add sp, sp, #16
+; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_128_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
;
; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_NOMIN_NOMAX: // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3
; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s0
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_NOMIN_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
;
; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_MIN_256_NOMAX: // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[7]
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #15]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3
; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s0
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[7]
+; SVE2_MIN_256_NOMAX-NEXT: mov z6.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
@@ -263,89 +230,59 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
;
; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_128_NOMAX: // %bb.0:
-; SVE2_128_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_128_NOMAX-NEXT: fmov w8, s1
-; SVE2_128_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_128_NOMAX-NEXT: fmov w9, s2
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_128_NOMAX-NEXT: fmov w8, s0
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_128_NOMAX-NEXT: fmov w9, s1
-; SVE2_128_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_128_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT: add sp, sp, #16
+; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
+; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
;
; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_NOMIN_NOMAX: // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s0
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s1
-; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16
+; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[6]
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
;
; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_MIN_256_NOMAX: // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16
-; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16
; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d3, [x0]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #14]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z3.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #13]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #12]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #11]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s0
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #10]
-; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s1
-; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #9]
-; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8]
-; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16
+; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[6]
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
@@ -401,34 +338,23 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: mov z1.b, z0.b[7]
-; CHECK-NEXT: mov z2.b, z0.b[6]
-; CHECK-NEXT: mov z3.b, z0.b[4]
-; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: fmov w9, s2
; CHECK-NEXT: mov z2.b, z0.b[3]
-; CHECK-NEXT: mov z1.b, z1.b[1]
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: fmov w8, s3
; CHECK-NEXT: mov z3.b, z0.b[2]
-; CHECK-NEXT: strb w9, [sp, #14]
-; CHECK-NEXT: mov z0.b, z0.b[1]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strb w8, [sp, #13]
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strb w9, [sp, #11]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w8, [sp, #10]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w9, [sp, #9]
-; CHECK-NEXT: strb w8, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov z4.b, z0.b[1]
+; CHECK-NEXT: mov z1.b, z1.b[1]
+; CHECK-NEXT: mov z5.b, z0.b[7]
+; CHECK-NEXT: mov z6.b, z0.b[6]
+; CHECK-NEXT: mov z0.b, z0.b[4]
+; CHECK-NEXT: zip1 z2.b, z3.b, z2.b
+; CHECK-NEXT: zip1 z1.b, z1.b, z4.b
+; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z0.b, z0.b, z0.b
+; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%op1 = load <8 x i8>, ptr %a
%op2 = load <8 x i8>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index 617b560713c3ab..478072d33d8c9b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@...
[truncated]
``````````
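One detail that is easy to miss in the truncated diff: for integer element types the new lowering bails out unless at most four operands would need an fmov from a general-purpose register. A standalone restatement of that check follows as a sketch; the function name and enum are hypothetical, as the patch inlines this test in LowerFixedLengthBuildVectorToSVE:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

enum class OpKind { ExtractElt, GPRScalar };

// Integer BUILD_VECTORs take the ZIP1 path only when at most four operands
// are not EXTRACT_VECTOR_ELTs: extracts already live in vector registers,
// while every other integer operand costs an fmov from a GPR.
bool worthZipLowering(const std::vector<OpKind> &Ops, bool IsIntegerElt) {
  if (!IsIntegerElt)
    return true; // FP scalars already live in FP/vector registers
  std::size_t NumExtracts =
      std::count(Ops.begin(), Ops.end(), OpKind::ExtractElt);
  return Ops.size() - NumExtracts <= 4;
}
```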
https://github.com/llvm/llvm-project/pull/111698