[llvm] e18b971 - [AArch64][InstCombine] Simplify repeated complex patterns in dupqlane
Matt Devereau via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 16 01:43:42 PST 2023
Author: Matt Devereau
Date: 2023-01-16T09:42:25Z
New Revision: e18b971685fb349299583d95716244f34f974ef8
URL: https://github.com/llvm/llvm-project/commit/e18b971685fb349299583d95716244f34f974ef8
DIFF: https://github.com/llvm/llvm-project/commit/e18b971685fb349299583d95716244f34f974ef8.diff
LOG: [AArch64][InstCombine] Simplify repeated complex patterns in dupqlane
Repeated floating-point complex patterns in dupqlane such as
(f32 a, f32 b, f32 a, f32 b) can be simplified to
shufflevector(f64(a, b), undef, 0).
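
As a sketch of the rewrite (taken verbatim from the updated
sve-intrinsics-perm-select.ll test below), the repeated f32 pattern

  %1 = insertelement <4 x float> undef, float %x, i64 0
  %2 = insertelement <4 x float> %1, float %y, i64 1
  %3 = insertelement <4 x float> %2, float %x, i64 2
  %4 = insertelement <4 x float> %3, float %y, i64 3
  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)

becomes an insert of just (x, y), a bitcast to an element type wide enough to
hold the pattern, a zero-mask splat, and a bitcast back:

  %1 = insertelement <4 x float> undef, float %x, i64 0
  %2 = insertelement <4 x float> %1, float %y, i64 1
  %3 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %2, i64 0)
  %4 = bitcast <vscale x 4 x float> %3 to <vscale x 2 x double>
  %5 = shufflevector <vscale x 2 x double> %4, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
  %6 = bitcast <vscale x 2 x double> %5 to <vscale x 4 x float>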
Added:
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
Modified:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ce073ce64d48e..471b05ba379b2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1436,6 +1436,86 @@ static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
return std::nullopt;
}
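+// Try to reduce Vec in place to a repeated, shorter pattern, e.g.
+// (a, b, a, b) to (a, b). Returns true if the second half of Vec repeats the
+// first half (recursing on the remaining half); unset (nullptr) elements
+// never match.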
+static bool SimplifyValuePattern(SmallVector<Value *> &Vec) {
+ size_t VecSize = Vec.size();
+ if (VecSize == 1)
+ return true;
+ if (!isPowerOf2_64(VecSize))
+ return false;
+ size_t HalfVecSize = VecSize / 2;
+
+ for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
+ RHS != Vec.end(); LHS++, RHS++) {
+ if (*LHS != nullptr && *RHS != nullptr && *LHS == *RHS)
+ continue;
+ return false;
+ }
+
+ Vec.resize(HalfVecSize);
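+  // Recurse to reduce further; the result is ignored because even when the
+  // halves of the reduced Vec no longer match, Vec already holds a valid
+  // repeating pattern.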
+ SimplifyValuePattern(Vec);
+ return true;
+}
+
+// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
+// to dupqlane(f64(C)) where C is A concatenated with B
+static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *CurrentInsertElt = nullptr, *Default = nullptr;
+ if (!match(II.getOperand(0),
+ m_Intrinsic<Intrinsic::vector_insert>(
+ m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
+ !isa<FixedVectorType>(CurrentInsertElt->getType()))
+ return std::nullopt;
+ auto IIScalableTy = cast<ScalableVectorType>(II.getType());
+
+ // Insert the scalars into a container ordered by InsertElement index
+ SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
+ while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
+ auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
+ Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
+ CurrentInsertElt = InsertElt->getOperand(0);
+ }
+
+ if (!SimplifyValuePattern(Elts))
+ return std::nullopt;
+
+ // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
+ for (size_t I = 0; I < Elts.size(); I++) {
+ InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I],
+ Builder.getInt64(I));
+ }
+
+ // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
+ // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
+ // be bitcast to a type wide enough to fit the sequence, be splatted, and then
+ // be narrowed back to the original type.
+ unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
+ unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
+ IIScalableTy->getMinNumElements() /
+ PatternWidth;
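+  // For example, <vscale x 8 x half> simplified to the pair (a, b) gives
+  // PatternWidth = 16 * 2 = 32 and PatternElementCount = 16 * 8 / 32 = 4,
+  // i.e. the pattern is splatted as <vscale x 4 x i32>.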
+
+ IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
+ auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
+ auto *WideShuffleMaskTy =
+ ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
+
+ auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
+ auto InsertSubvector = Builder.CreateInsertVector(
+ II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
+ auto WideBitcast =
+ Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
+ auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
+ auto WideShuffle = Builder.CreateShuffleVector(
+ WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
+ auto NarrowBitcast =
+ Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
+
+ return IC.replaceInstUsesWith(II, NarrowBitcast);
+}
+
static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
IntrinsicInst &II) {
Value *A = II.getArgOperand(0);
@@ -1553,6 +1633,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVESel(IC, II);
case Intrinsic::aarch64_sve_srshl:
return instCombineSVESrshl(IC, II);
+ case Intrinsic::aarch64_sve_dupq_lane:
+ return instCombineSVEDupqLane(IC, II);
}
return std::nullopt;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
index 2f3f342cdb932..8dc0ab649b4d3 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -587,49 +587,35 @@ define <vscale x 2 x i64> @dupq_i64_range(<vscale x 2 x i64> %a) {
define dso_local <vscale x 4 x float> @dupq_f32_repeat_complex(float %x, float %y) {
; CHECK-LABEL: dupq_f32_repeat_complex:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
-; CHECK-NEXT: mov v2.s[1], v1.s[0]
-; CHECK-NEXT: mov v2.s[2], v0.s[0]
-; CHECK-NEXT: mov v2.s[3], v1.s[0]
-; CHECK-NEXT: mov z0.q, q2
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: mov z0.d, d0
; CHECK-NEXT: ret
%1 = insertelement <4 x float> undef, float %x, i64 0
%2 = insertelement <4 x float> %1, float %y, i64 1
- %3 = insertelement <4 x float> %2, float %x, i64 2
- %4 = insertelement <4 x float> %3, float %y, i64 3
- %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
- %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+ %3 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %2, i64 0)
+ %4 = bitcast <vscale x 4 x float> %3 to <vscale x 2 x double>
+ %5 = shufflevector <vscale x 2 x double> %4, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+ %6 = bitcast <vscale x 2 x double> %5 to <vscale x 4 x float>
ret <vscale x 4 x float> %6
}
-define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %a, half %b) {
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %x, half %y) {
; CHECK-LABEL: dupq_f16_repeat_complex:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
-; CHECK-NEXT: mov v2.h[1], v1.h[0]
-; CHECK-NEXT: mov v2.h[2], v0.h[0]
-; CHECK-NEXT: mov v2.h[3], v1.h[0]
-; CHECK-NEXT: mov v2.h[4], v0.h[0]
-; CHECK-NEXT: mov v2.h[5], v1.h[0]
-; CHECK-NEXT: mov v2.h[6], v0.h[0]
-; CHECK-NEXT: mov v2.h[7], v1.h[0]
-; CHECK-NEXT: mov z0.q, q2
-; CHECK-NEXT: ret
- %1 = insertelement <8 x half> undef, half %a, i64 0
- %2 = insertelement <8 x half> %1, half %b, i64 1
- %3 = insertelement <8 x half> %2, half %a, i64 2
- %4 = insertelement <8 x half> %3, half %b, i64 3
- %5 = insertelement <8 x half> %4, half %a, i64 4
- %6 = insertelement <8 x half> %5, half %b, i64 5
- %7 = insertelement <8 x half> %6, half %a, i64 6
- %8 = insertelement <8 x half> %7, half %b, i64 7
- %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
- %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
- ret <vscale x 8 x half> %10
+; CHECK-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: ret
+ %1 = insertelement <8 x half> undef, half %x, i64 0
+ %2 = insertelement <8 x half> %1, half %y, i64 1
+ %3 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %2, i64 0)
+ %4 = bitcast <vscale x 8 x half> %3 to <vscale x 4 x float>
+ %5 = shufflevector <vscale x 4 x float> %4, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+ %6 = bitcast <vscale x 4 x float> %5 to <vscale x 8 x half>
+ ret <vscale x 8 x half> %6
}
define <vscale x 16 x i8> @ext_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
new file mode 100644
index 0000000000000..9b375836c0fdf
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local <vscale x 4 x float> @dupq_f32_ab_pattern(float %x, float %y) {
+; CHECK-LABEL: @dupq_f32_ab_pattern(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 4 x float> [[TMP3]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 4 x float>
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP6]]
+;
+ %1 = insertelement <4 x float> poison, float %x, i64 0
+ %2 = insertelement <4 x float> %1, float %y, i64 1
+ %3 = insertelement <4 x float> %2, float %x, i64 2
+ %4 = insertelement <4 x float> %3, float %y, i64 3
+ %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %4, i64 0)
+ %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+ ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_a_pattern(half %a) {
+; CHECK-LABEL: @dupq_f16_a_pattern(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP3]], i64 0)
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP4]]
+;
+ %1 = insertelement <8 x half> poison, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %a, i64 1
+ %3 = insertelement <8 x half> %2, half %a, i64 2
+ %4 = insertelement <8 x half> %3, half %a, i64 3
+ %5 = insertelement <8 x half> %4, half %a, i64 4
+ %6 = insertelement <8 x half> %5, half %a, i64 5
+ %7 = insertelement <8 x half> %6, half %a, i64 6
+ %8 = insertelement <8 x half> %7, half %a, i64 7
+ %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+ %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+ ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP6]]
+;
+ %1 = insertelement <8 x half> poison, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %b, i64 1
+ %3 = insertelement <8 x half> %2, half %a, i64 2
+ %4 = insertelement <8 x half> %3, half %b, i64 3
+ %5 = insertelement <8 x half> %4, half %a, i64 4
+ %6 = insertelement <8 x half> %5, half %b, i64 5
+ %7 = insertelement <8 x half> %6, half %a, i64 6
+ %8 = insertelement <8 x half> %7, half %b, i64 7
+ %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+ %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+ ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcd_pattern(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcd_pattern(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <vscale x 2 x i64> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP8]]
+;
+ %1 = insertelement <8 x half> poison, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %b, i64 1
+ %3 = insertelement <8 x half> %2, half %c, i64 2
+ %4 = insertelement <8 x half> %3, half %d, i64 3
+ %5 = insertelement <8 x half> %4, half %a, i64 4
+ %6 = insertelement <8 x half> %5, half %b, i64 5
+ %7 = insertelement <8 x half> %6, half %c, i64 6
+ %8 = insertelement <8 x half> %7, half %d, i64 7
+ %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+ %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+ ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcnull_pattern(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcnull_pattern(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[A]], i64 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[B]], i64 5
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[C]], i64 6
+; CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP7]], i64 0)
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP8]]
+;
+ %1 = insertelement <8 x half> poison, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %b, i64 1
+ %3 = insertelement <8 x half> %2, half %c, i64 2
+ %4 = insertelement <8 x half> %3, half %a, i64 4
+ %5 = insertelement <8 x half> %4, half %b, i64 5
+ %6 = insertelement <8 x half> %5, half %c, i64 6
+ %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %6, i64 0)
+ %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+ ret <vscale x 8 x half> %8
+}
+
+; Insert %c over the last element in the insertelement chain, which breaks
+; the repeated pattern and prevents the combine
+
+define dso_local <vscale x 8 x half> @neg_dupq_f16_abcd_pattern_double_insert(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @neg_dupq_f16_abcd_pattern_double_insert(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[C]], i64 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[C]], i64 7
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP10]]
+;
+ %1 = insertelement <8 x half> poison, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %b, i64 1
+ %3 = insertelement <8 x half> %2, half %c, i64 2
+ %4 = insertelement <8 x half> %3, half %d, i64 3
+ %5 = insertelement <8 x half> %4, half %a, i64 4
+ %6 = insertelement <8 x half> %5, half %b, i64 5
+ %7 = insertelement <8 x half> %6, half %c, i64 6
+ %8 = insertelement <8 x half> %7, half %d, i64 7
+ %9 = insertelement <8 x half> %8, half %c, i64 7
+ %10 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %9, i64 0)
+ %11 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %10, i64 0)
+ ret <vscale x 8 x half> %11
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcd_pattern_reverted_insert(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcd_pattern_reverted_insert(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <vscale x 2 x i64> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP8]]
+;
+ %1 = insertelement <8 x half> poison, half %d, i64 7
+ %2 = insertelement <8 x half> %1, half %c, i64 6
+ %3 = insertelement <8 x half> %2, half %b, i64 5
+ %4 = insertelement <8 x half> %3, half %a, i64 4
+ %5 = insertelement <8 x half> %4, half %d, i64 3
+ %6 = insertelement <8 x half> %5, half %c, i64 2
+ %7 = insertelement <8 x half> %6, half %b, i64 1
+ %8 = insertelement <8 x half> %7, half %a, i64 0
+ %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+ %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+ ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_no_front_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_no_front_pattern(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B:%.*]], i64 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP10]]
+;
+ %1 = insertelement <8 x half> poison, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %a, i64 1
+ %3 = insertelement <8 x half> %2, half %a, i64 2
+ %4 = insertelement <8 x half> %3, half %b, i64 3
+ %5 = insertelement <8 x half> %4, half %a, i64 4
+ %6 = insertelement <8 x half> %5, half %b, i64 5
+ %7 = insertelement <8 x half> %6, half %a, i64 6
+ %8 = insertelement <8 x half> %7, half %b, i64 7
+ %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+ %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+ ret <vscale x 8 x half> %10
+}
+
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v2f32(<vscale x 4 x float>, <2 x float>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+
+attributes #0 = { "target-features"="+sve" }