[llvm] e18b971 - [AArch64][InstCombine] Simplify repeated complex patterns in dupqlane

Matt Devereau via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 16 01:43:42 PST 2023


Author: Matt Devereau
Date: 2023-01-16T09:42:25Z
New Revision: e18b971685fb349299583d95716244f34f974ef8

URL: https://github.com/llvm/llvm-project/commit/e18b971685fb349299583d95716244f34f974ef8
DIFF: https://github.com/llvm/llvm-project/commit/e18b971685fb349299583d95716244f34f974ef8.diff

LOG: [AArch64][InstCombine] Simplify repeated complex patterns in dupqlane

Repeated floating-point complex patterns in dupqlane, such as
(f32 a, f32 b, f32 a, f32 b), can be simplified to
shufflevector(f64(a, b), undef, 0).
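
As an illustration (abbreviated from the new InstCombine test added below),
the (f32 a, f32 b, f32 a, f32 b) case is rewritten from

    %1 = insertelement <4 x float> poison, float %x, i64 0
    %2 = insertelement <4 x float> %1, float %y, i64 1
    %3 = insertelement <4 x float> %2, float %x, i64 2
    %4 = insertelement <4 x float> %3, float %y, i64 3
    %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %4, i64 0)
    %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)

into a splat of the combined 64-bit element:

    %1 = insertelement <4 x float> poison, float %x, i64 0
    %2 = insertelement <4 x float> %1, float %y, i64 1
    %3 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %2, i64 0)
    %4 = bitcast <vscale x 4 x float> %3 to <vscale x 2 x i64>
    %5 = shufflevector <vscale x 2 x i64> %4, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
    %6 = bitcast <vscale x 2 x i64> %5 to <vscale x 4 x float>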

Added: 
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ce073ce64d48e..471b05ba379b2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1436,6 +1436,86 @@ static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
   return std::nullopt;
 }
 
+bool SimplifyValuePattern(SmallVector<Value *> &Vec) {
+  size_t VecSize = Vec.size();
+  if (VecSize == 1)
+    return true;
+  if (!isPowerOf2_64(VecSize))
+    return false;
+  size_t HalfVecSize = VecSize / 2;
+
+  for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
+       RHS != Vec.end(); LHS++, RHS++) {
+    if (*LHS != nullptr && *RHS != nullptr && *LHS == *RHS)
+      continue;
+    return false;
+  }
+
+  Vec.resize(HalfVecSize);
+  SimplifyValuePattern(Vec);
+  return true;
+}
+
+// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
+// to dupqlane(f64(C)) where C is A concatenated with B
+static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
+                                                           IntrinsicInst &II) {
+  Value *CurrentInsertElt = nullptr, *Default = nullptr;
+  if (!match(II.getOperand(0),
+             m_Intrinsic<Intrinsic::vector_insert>(
+                 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
+      !isa<FixedVectorType>(CurrentInsertElt->getType()))
+    return std::nullopt;
+  auto IIScalableTy = cast<ScalableVectorType>(II.getType());
+
+  // Insert the scalars into a container ordered by InsertElement index
+  SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
+  while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
+    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
+    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
+    CurrentInsertElt = InsertElt->getOperand(0);
+  }
+
+  if (!SimplifyValuePattern(Elts))
+    return std::nullopt;
+
+  // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
+  for (size_t I = 0; I < Elts.size(); I++) {
+    InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I],
+                                                 Builder.getInt64(I));
+  }
+
+  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
+  // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
+  // be bitcast to a type wide enough to fit the sequence, be splatted, and then
+  // be narrowed back to the original type.
+  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
+  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
+                                 IIScalableTy->getMinNumElements() /
+                                 PatternWidth;
+
+  IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
+  auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
+  auto *WideShuffleMaskTy =
+      ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
+
+  auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
+  auto InsertSubvector = Builder.CreateInsertVector(
+      II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
+  auto WideBitcast =
+      Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
+  auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
+  auto WideShuffle = Builder.CreateShuffleVector(
+      WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
+  auto NarrowBitcast =
+      Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
+
+  return IC.replaceInstUsesWith(II, NarrowBitcast);
+}
+
 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                         IntrinsicInst &II) {
   Value *A = II.getArgOperand(0);
@@ -1553,6 +1633,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVESel(IC, II);
   case Intrinsic::aarch64_sve_srshl:
     return instCombineSVESrshl(IC, II);
+  case Intrinsic::aarch64_sve_dupq_lane:
+    return instCombineSVEDupqLane(IC, II);
   }
 
   return std::nullopt;

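As a worked example of the pattern-width arithmetic above (illustrative only,
not part of the patch): for <vscale x 8 x half> simplified down to an (a, b)
sequence, PatternWidth = 16 * 2 = 32 bits and PatternElementCount =
16 * 8 / 32 = 4, so the simplified sequence is splatted as <vscale x 4 x i32>
and then bitcast back to <vscale x 8 x half>. This is what produces the
"mov z0.s, s0" in the updated dupq_f16_repeat_complex CodeGen test below.
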
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
index 2f3f342cdb932..8dc0ab649b4d3 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -587,49 +587,35 @@ define <vscale x 2 x i64> @dupq_i64_range(<vscale x 2 x i64> %a) {
 define dso_local <vscale x 4 x float> @dupq_f32_repeat_complex(float %x, float %y) {
 ; CHECK-LABEL: dupq_f32_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-NEXT:    mov v2.s[1], v1.s[0]
-; CHECK-NEXT:    mov v2.s[2], v0.s[0]
-; CHECK-NEXT:    mov v2.s[3], v1.s[0]
-; CHECK-NEXT:    mov z0.q, q2
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    ret
   %1 = insertelement <4 x float> undef, float %x, i64 0
   %2 = insertelement <4 x float> %1, float %y, i64 1
-  %3 = insertelement <4 x float> %2, float %x, i64 2
-  %4 = insertelement <4 x float> %3, float %y, i64 3
-  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
-  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  %3 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %2, i64 0)
+  %4 = bitcast <vscale x 4 x float> %3 to <vscale x 2 x double>
+  %5 = shufflevector <vscale x 2 x double> %4, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+  %6 = bitcast <vscale x 2 x double> %5 to <vscale x 4 x float>
   ret <vscale x 4 x float> %6
 }
 
-define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %a, half %b) {
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %x, half %y) {
 ; CHECK-LABEL: dupq_f16_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
-; CHECK-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-NEXT:    mov v2.h[4], v0.h[0]
-; CHECK-NEXT:    mov v2.h[5], v1.h[0]
-; CHECK-NEXT:    mov v2.h[6], v0.h[0]
-; CHECK-NEXT:    mov v2.h[7], v1.h[0]
-; CHECK-NEXT:    mov z0.q, q2
-; CHECK-NEXT:    ret
-  %1 = insertelement <8 x half> undef, half %a, i64 0
-  %2 = insertelement <8 x half> %1, half %b, i64 1
-  %3 = insertelement <8 x half> %2, half %a, i64 2
-  %4 = insertelement <8 x half> %3, half %b, i64 3
-  %5 = insertelement <8 x half> %4, half %a, i64 4
-  %6 = insertelement <8 x half> %5, half %b, i64 5
-  %7 = insertelement <8 x half> %6, half %a, i64 6
-  %8 = insertelement <8 x half> %7, half %b, i64 7
-  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
-  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
-  ret <vscale x 8 x half> %10
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = insertelement <8 x half> undef, half %x, i64 0
+  %2 = insertelement <8 x half> %1, half %y, i64 1
+  %3 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %2, i64 0)
+  %4 = bitcast <vscale x 8 x half> %3 to <vscale x 4 x float>
+  %5 = shufflevector <vscale x 4 x float> %4, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  %6 = bitcast <vscale x 4 x float> %5 to <vscale x 8 x half>
+  ret <vscale x 8 x half> %6
 }
 
 define <vscale x 16 x i8> @ext_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
new file mode 100644
index 0000000000000..9b375836c0fdf
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local <vscale x 4 x float> @dupq_f32_ab_pattern(float %x, float %y) {
+; CHECK-LABEL: @dupq_f32_ab_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 4 x float> [[TMP3]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> poison, float %x, i64 0
+  %2 = insertelement <4 x float> %1, float %y, i64 1
+  %3 = insertelement <4 x float> %2, float %x, i64 2
+  %4 = insertelement <4 x float> %3, float %y, i64 3
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_a_pattern(half %a) {
+; CHECK-LABEL: @dupq_f16_a_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP3]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP4]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %a, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %a, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %a, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcd_pattern(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcd_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <vscale x 2 x i64> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %c, i64 2
+  %4 = insertelement <8 x half> %3, half %d, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %c, i64 6
+  %8 = insertelement <8 x half> %7, half %d, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcnull_pattern(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcnull_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[C]], i64 6
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP7]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %c, i64 2
+  %4 = insertelement <8 x half> %3, half %a, i64 4
+  %5 = insertelement <8 x half> %4, half %b, i64 5
+  %6 = insertelement <8 x half> %5, half %c, i64 6
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+; Insert %c to override the last element in the insertelement chain, which will fail to combine
+
+define dso_local <vscale x 8 x half> @neg_dupq_f16_abcd_pattern_double_insert(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @neg_dupq_f16_abcd_pattern_double_insert(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[C]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[C]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %c, i64 2
+  %4 = insertelement <8 x half> %3, half %d, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %c, i64 6
+  %8 = insertelement <8 x half> %7, half %d, i64 7
+  %9 = insertelement <8 x half> %8, half %c, i64 7
+  %10 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %9, i64 0)
+  %11 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %10, i64 0)
+  ret <vscale x 8 x half> %11
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_abcd_pattern_reverted_insert(half %a, half %b, half %c, half %d) {
+; CHECK-LABEL: @dupq_f16_abcd_pattern_reverted_insert(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[C:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <vscale x 2 x i64> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> poison, half %d, i64 7
+  %2 = insertelement <8 x half> %1, half %c, i64 6
+  %3 = insertelement <8 x half> %2, half %b, i64 5
+  %4 = insertelement <8 x half> %3, half %a, i64 4
+  %5 = insertelement <8 x half> %4, half %d, i64 3
+  %6 = insertelement <8 x half> %5, half %c, i64 2
+  %7 = insertelement <8 x half> %6, half %b, i64 1
+  %8 = insertelement <8 x half> %7, half %a, i64 0
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_ab_no_front_pattern(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_ab_no_front_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> poison, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 4 x float>, <2 x float>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+
+attributes #0 = { "target-features"="+sve" }
