[llvm] [AArch64] Improve lowering of truncating build vectors (PR #81960)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 15 18:11:27 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Usman Nadeem (UsmanNadeem)
Changes:
1. Look through assert_zext/sext nodes.
2. Generalize `ReconstructTruncateFromBuildVector` to work for more cases.
Change-Id: I717a7471986ea4961c71df62912f8dd6f1723118
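For illustration, here is one of the affected cases from `arm64-convert-v4f64.ll` below: with these changes the `<4 x double> -> <4 x i16>` conversion lowers to a single `uzp1` plus `xtn` rather than two `xtn`s feeding a `uzp1`, saving an instruction (the before/after instruction sequences in the comments are copied from the test diff):

```llvm
; Current lowering (after the ldp):  fcvtzs, fcvtzs, xtn, xtn, uzp1
; With this patch:                   fcvtzs, fcvtzs, uzp1, xtn
define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
  %tmp1 = load <4 x double>, ptr %ptr
  %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
  ret <4 x i16> %tmp2
}
```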
---
Patch is 85.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81960.diff
11 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+119-45)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+1)
- (modified) llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll (+8-13)
- (modified) llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll (+2-3)
- (modified) llvm/test/CodeGen/AArch64/fptoi.ll (+160-322)
- (modified) llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll (+264-289)
- (modified) llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll (+201-226)
- (modified) llvm/test/CodeGen/AArch64/neon-extracttruncate.ll (+2-14)
- (modified) llvm/test/CodeGen/AArch64/shuffle-tbl34.ll (+14-38)
- (modified) llvm/test/CodeGen/AArch64/trunc-v1i64.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/vcvt-oversize.ll (+2-3)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8c5a4cdae11634..353509a1c1efa9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11369,54 +11369,105 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
return true;
}
-// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
-// v4i32s. This is really a truncate, which we can construct out of (legal)
-// concats and truncate nodes.
-static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
- if (V.getValueType() != MVT::v16i8)
- return SDValue();
- assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
-
- for (unsigned X = 0; X < 4; X++) {
- // Check the first item in each group is an extract from lane 0 of a v4i32
- // or v4i16.
- SDValue BaseExt = V.getOperand(X * 4);
- if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
- BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
- !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
- BaseExt.getConstantOperandVal(1) != 0)
+// Detect patterns like a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 that
+// form a truncate, which we can construct out of (legal) concats and
+// truncate nodes.
+static SDValue ReconstructTruncateFromBuildVector(SDValue V,
+ SelectionDAG &DAG) {
+ EVT BVTy = V.getValueType();
+ if (BVTy != MVT::v16i8 && BVTy != MVT::v8i16 && BVTy != MVT::v8i8 &&
+ BVTy != MVT::v4i16)
+ return SDValue();
+
+ // Only handle truncating BVs.
+ if (V.getOperand(0).getValueType().getSizeInBits() ==
+ BVTy.getScalarSizeInBits())
+ return SDValue();
+
+ SmallVector<SDValue, 4> Sources;
+ uint64_t LastIdx = 0;
+ uint64_t MaxIdx = 0;
+ // Check for sequential indices e.g. i=0, i+1, ..., i=0, i+1, ...
+ for (SDValue Extr : V->ops()) {
+ SDValue SourceVec = Extr.getOperand(0);
+ EVT SourceVecTy = SourceVec.getValueType();
+
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(SourceVecTy))
return SDValue();
- SDValue Base = BaseExt.getOperand(0);
- // And check the other items are extracts from the same vector.
- for (unsigned Y = 1; Y < 4; Y++) {
- SDValue Ext = V.getOperand(X * 4 + Y);
- if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- Ext.getOperand(0) != Base ||
- !isa<ConstantSDNode>(Ext.getOperand(1)) ||
- Ext.getConstantOperandVal(1) != Y)
+ if (!isa<ConstantSDNode>(Extr.getOperand(1)))
+ return SDValue();
+
+ uint64_t CurIdx = Extr.getConstantOperandVal(1);
+ // Allow repeat of sources.
+ if (CurIdx == 0) {
+ // Check if all lanes are used by the BV.
+ if (Sources.size() && Sources[Sources.size() - 1]
+ .getValueType()
+ .getVectorMinNumElements() != LastIdx + 1)
return SDValue();
- }
+ Sources.push_back(SourceVec);
+ } else if (CurIdx != LastIdx + 1)
+ return SDValue();
+
+ LastIdx = CurIdx;
+ MaxIdx = std::max(MaxIdx, CurIdx);
}
- // Turn the buildvector into a series of truncates and concates, which will
- // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
- // concat together to produce 2 v8i16. These are both truncated and concat
- // together.
+ // Check if all lanes are used by the BV.
+ if (Sources[Sources.size() - 1].getValueType().getVectorMinNumElements() !=
+ LastIdx + 1)
+ return SDValue();
+ if (Sources.size() % 2 != 0)
+ return SDValue();
+
+ // At this point we know that we have a truncating BV of extract_vector_elt.
+ // We can just truncate and concat them.
SDLoc DL(V);
- SDValue Trunc[4] = {
- V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
- V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
- for (SDValue &V : Trunc)
- if (V.getValueType() == MVT::v4i32)
- V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
- SDValue Concat0 =
- DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
- SDValue Concat1 =
- DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
- SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
- SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
+ LLVMContext &Ctx = *DAG.getContext();
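+  // Pairwise reduce the sources: truncate any 128-bit source to half element
+  // width, then concat adjacent pairs. Each such trunc + concat pair matches
+  // the uzp1 patterns in AArch64InstrInfo.td.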
+ while (Sources.size() > 1) {
+ for (unsigned i = 0; i < Sources.size(); i += 2) {
+ SDValue V1 = Sources[i];
+ SDValue V2 = Sources[i + 1];
+ EVT VT1 = V1.getValueType();
+ EVT VT2 = V2.getValueType();
+
+ if (VT1.is128BitVector()) {
+ VT1 = VT1.changeVectorElementType(
+ VT1.getVectorElementType().getHalfSizedIntegerVT(Ctx));
+ V1 = DAG.getNode(ISD::TRUNCATE, DL, VT1, V1);
+ }
+ if (VT2.is128BitVector()) {
+ VT2 = VT2.changeVectorElementType(
+ VT2.getVectorElementType().getHalfSizedIntegerVT(Ctx));
+ V2 = DAG.getNode(ISD::TRUNCATE, DL, VT2, V2);
+ }
+
+ assert(VT1 == VT2 && "Mismatched types.");
+ Sources[i / 2] =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL,
+ VT1.getDoubleNumVectorElementsVT(Ctx), V1, V2);
+ }
+ Sources.resize(Sources.size() / 2);
+ }
+
+  // We might not have the final type in some cases, e.g. (v4i32, v4i32) ->
+  // v8i8. Do a final truncating shuffle instead of a concat + trunc.
+ if (Sources[0].getValueType() != BVTy) {
+ SDValue V1 = Sources[0].getOperand(0);
+ SDValue V2 = Sources[0].getOperand(1);
+ V1 = DAG.getNode(DAG.getDataLayout().isLittleEndian() ? ISD::BITCAST
+ : AArch64ISD::NVCAST,
+ DL, BVTy, V1);
+ V2 = DAG.getNode(DAG.getDataLayout().isLittleEndian() ? ISD::BITCAST
+ : AArch64ISD::NVCAST,
+ DL, BVTy, V2);
+
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i < BVTy.getVectorNumElements() * 2; i += 2)
+ MaskVec.push_back(i);
+ return DAG.getVectorShuffle(BVTy, DL, V1, V2, MaskVec);
+ }
+ return Sources[0];
}
/// Check if a vector shuffle corresponds to a DUP instructions with a larger
@@ -13305,8 +13356,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
// v4i32s. This is really a truncate, which we can construct out of (legal)
// concats and truncate nodes.
- if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
- return M;
+ if (AllLanesExtractElt)
+ if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
+ return M;
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
@@ -19096,6 +19148,28 @@ static SDValue performBuildVectorCombine(SDNode *N,
SDLoc DL(N);
EVT VT = N->getValueType(0);
+ // BUILD_VECTOR (extract_elt(Assert[S|Z]ext(x)))
+ // => BUILD_VECTOR (extract_elt(x))
+ SmallVector<SDValue, 8> Ops;
+ bool ExtractExtended = false;
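+  // Tracks whether any operand actually looked through an Assert[S|Z]ext;
+  // if none did, leave the node alone and fall through to the combines below.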
+ for (SDValue Extr : N->ops()) {
+ if (Extr.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ ExtractExtended = false;
+ break;
+ }
+ SDValue ExtractBase = Extr.getOperand(0);
+ if (ExtractBase.getOpcode() == ISD::AssertSext ||
+ ExtractBase.getOpcode() == ISD::AssertZext) {
+ ExtractExtended = true;
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ Extr.getValueType(), ExtractBase.getOperand(0),
+ Extr.getOperand(1)));
+ } else
+ Ops.push_back(Extr);
+ }
+ if (ExtractExtended)
+ return DAG.getBuildVector(VT, DL, Ops);
+
// A build vector of two extracted elements is equivalent to an
// extract subvector where the inner vector is any-extended to the
// extract_vector_elt VT.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 8c2a852850320f..331eaa6fb24fda 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6114,6 +6114,7 @@ def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))),
def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
(v2i32 (trunc (v2i64 V128:$Vm))))),
(UZP1v4i32 V128:$Vn, V128:$Vm)>;
+
// These are the same as above, with an optional assertzext node that can be
// generated from fptoi lowering.
def : Pat<(v16i8 (concat_vectors (v8i8 (assertzext (trunc (v8i16 V128:$Vn)))),
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 9bf638f57a5120..193e3b0cfbc7bc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%tmp1 = load <4 x double>, ptr %ptr
%tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v3.2s, v3.2d
-; CHECK-NEXT: xtn v2.2s, v2.2d
-; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: uzp1 v1.4h, v2.4h, v3.4h
-; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%tmp1 = load <8 x double>, ptr %ptr
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
@@ -72,9 +68,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%tmp1 = load <4 x double>, ptr %ptr
%tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 1ea87bb6b04b51..0a3b9a070c2b32 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs.4s v1, v1
; CHECK-NEXT: fcvtzs.4s v0, v0
-; CHECK-NEXT: xtn.4h v1, v1
-; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: uzp1.8b v0, v0, v1
+; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: xtn.8b v0, v0
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 251719c1e3b430..a099db47655558 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1096,30 +1096,17 @@ entry:
}
define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) {
-; CHECK-SD-LABEL: fptos_v3f64_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptos_v3f64_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptos_v3f64_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptosi <3 x double> %a to <3 x i16>
ret <3 x i16> %c
@@ -1134,9 +1121,8 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) {
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v3f64_v3i16:
@@ -1160,9 +1146,8 @@ define <4 x i16> @fptos_v4f64_v4i16(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v4f64_v4i16:
@@ -1182,9 +1167,8 @@ define <4 x i16> @fptou_v4f64_v4i16(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v4f64_v4i16:
@@ -1204,15 +1188,11 @@ define <8 x i16> @fptos_v8f64_v8i16(<8 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT: adrp x8, .LCPI54_0
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v6.2s, v3.2d
-; CHECK-SD-NEXT: xtn v5.2s, v2.2d
-; CHECK-SD-NEXT: xtn v4.2s, v1.2d
-; CHECK-SD-NEXT: xtn v3.2s, v0.2d
-; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI54_0]
-; CHECK-SD-NEXT: tbl v0.16b, { v3.16b, v4.16b, v5.16b, v6.16b }, v0.16b
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v8f64_v8i16:
@@ -1235,15 +1215,11 @@ define <8 x i16> @fptou_v8f64_v8i16(<8 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT: adrp x8, .LCPI55_0
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v6.2s, v3.2d
-; CHECK-SD-NEXT: xtn v5.2s, v2.2d
-; CHECK-SD-NEXT: xtn v4.2s, v1.2d
-; CHECK-SD-NEXT: xtn v3.2s, v0.2d
-; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI55_0]
-; CHECK-SD-NEXT: tbl v0.16b, { v3.16b, v4.16b, v5.16b, v6.16b }, v0.16b
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v8f64_v8i16:
@@ -1265,25 +1241,19 @@ define <16 x i16> @fptos_v16f64_v16i16(<16 x double> %a) {
; CHECK-SD-LABEL: fptos_v16f64_v16i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d
-; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT: adrp x8, .LCPI56_0
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
-; CHECK-SD-NEXT: fcvtzs v5.2d, v5.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
+; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT: fcvtzs v5.2d, v5.2d
; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d
-; CHECK-SD-NEXT: xtn v19.2s, v3.2d
-; CHECK-SD-NEXT: xtn v23.2s, v7.2d
-; CHECK-SD-NEXT: xtn v18.2s, v2.2d
-; CHECK-SD-NEXT: xtn v22.2s, v6.2d
-; CHECK-SD-NEXT: xtn v17.2s, v1.2d
-; CHECK-SD-NEXT: xtn v21.2s, v5.2d
-; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI56_0]
-; CHECK-SD-NEXT: xtn v16.2s, v0.2d
-; CHECK-SD-NEXT: xtn v20.2s, v4.2d
-; CHECK-SD-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
-; CHECK-SD-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v1.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v3.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v1.8h, v3.8h, v1.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v16f64_v16i16:
@@ -1312,25 +1282,19 @@ define <16 x i16> @fptou_v16f64_v16i16(<16 x double> %a) {
; CHECK-SD-LABEL: fptou_v16f64_v16i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d
-; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT: adrp x8, .LCPI57_0
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
-; CHECK-SD-NEXT: fcvtzs v5.2d, v5.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
+; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT: fcvtzs v5.2d, v5.2d
; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d
-; CHECK-SD-NEXT: xtn v19.2s, v3.2d
-; CHECK-SD-NEXT: xtn v23.2s, v7.2d
-; CHECK-SD-NEXT: xtn v18.2s, v2.2d
-; CHECK-SD-NEXT: xtn v22.2s, v6.2d
-; CHECK-SD-NEXT: xtn v17.2s, v1.2d
-; CHECK-SD-NEXT: xtn v21.2s, v5.2d
-; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI57_0]
-; CHECK-SD-NEXT: xtn v16.2s, v0.2d
-; CHECK-SD-NEXT: xtn v20.2s, v4.2d
-; CHECK-SD-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
-; CHECK-SD-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v1.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v3.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v1.8h, v3.8h, v1.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v16f64_v16i16:
@@ -1358,65 +1322,38 @@ entry:
define <32 x i16> @fptos_v32f64_v32i16(<32 x double> %a) {
; CHECK-SD-LABEL: fptos_v32f64_v32i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
-; CHECK-SD-NEXT: .cfi_offset b8, -8
-; CHECK-SD-NEXT: .cfi_offset b9, -16
-; CHECK-SD-NEXT: .cfi_offset b10, -24
-; CHECK-SD-NEXT: .cfi_offset b11, -32
-; CHECK-SD-NEXT: .cfi_offset b12, -40
-; CHECK-SD-NEXT: .cfi_offset b13, -48
-; CHECK-SD-NEXT: .cfi_offset b14, -56
-; CHECK-SD-NEXT: .cfi_offset b15, -64
+; CHECK-SD-NEXT: ldp q16, q17, [sp, #64]
; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d
-; CHECK-SD-NEXT: fcvtzs v18.2d, v2.2d
-; CHECK-SD-NEXT: adrp x8, .LCPI58_0
-; CHECK-SD-NEXT: fcvtzs v19.2d, v1.2d
-; CHECK-SD-NE...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/81960
More information about the llvm-commits mailing list