[llvm] [AArch64] Change the cost of fma and fmuladd to match fmul. (PR #152963)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 14 13:42:10 PDT 2025
https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/152963
>From c2e406d256a8736236d1779df1d23dada32c62aa Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 14 Aug 2025 21:41:54 +0100
Subject: [PATCH] [AArch64] Change the cost of fma and fmuladd to match fmul.
I believe that, as fmul and fmadd are so similar, their performance
characteristics tend to be the same on most platforms, at least in terms of
reciprocal throughput. Processors capable of performing a given number of
fmul operations per cycle can usually perform the same number of fma
operations, with the extra add being relatively cheap on top. This patch
therefore scores the two operations the same, bringing the throughput cost
of an fma/fmuladd to 2 and the latency to 3, which are the defaults for
fmul.

Note that we might also want to change the throughput cost of an fmul to 1,
as most processors have ample bandwidth for it, but the two should still
stay in line with one another.
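As a rough, non-authoritative illustration, the new numbers can be read
straight out of the cost-model printer with the same kind of invocation the
arith-fp.ll test below uses (the exact RUN line here is an assumption
modelled on that test, not part of the patch):

  ; fma-cost.ll: minimal sketch querying the AArch64 cost model for an fma.
  ; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" \
  ; RUN:   -cost-kind=all -disable-output 2>&1 | FileCheck %s
  ; With this patch, fma matches fmul: reciprocal throughput 2, latency 3.
  ; CHECK: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r = call float @llvm.fma.f32
  define void @fma_cost(float %a, float %b, float %c) {
    %r = call float @llvm.fma.f32(float %a, float %b, float %c)
    ret void
  }
  declare float @llvm.fma.f32(float, float, float)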
---
.../AArch64/AArch64TargetTransformInfo.cpp | 10 ++
.../Analysis/CostModel/AArch64/arith-fp.ll | 44 +++---
.../CostModel/AArch64/sve-arith-fp.ll | 32 ++--
.../AArch64/f128-fmuladd-reduction.ll | 148 +++++++++++-------
.../AArch64/veclib-intrinsic-calls.ll | 6 +-
.../AArch64/reused-scalar-repeated-in-node.ll | 142 +++++------------
.../AArch64/vec3-reorder-reshuffle.ll | 39 +++--
.../SLPVectorizer/insertelement-postpone.ll | 22 ++-
8 files changed, 214 insertions(+), 229 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3fba7e853eafb..3042251cf754d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -666,6 +666,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return LT.first;
break;
}
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd: {
+ // Cost an fma or fmuladd the same as an fmul instruction, as the costs of
+ // the two are usually the same. TODO: Add fp16 and bf16 expansion costs.
+ Type *EltTy = RetTy->getScalarType();
+ if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
+ (EltTy->isHalfTy() && ST->hasFullFP16()))
+ return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
+ break;
+ }
case Intrinsic::stepvector: {
InstructionCost Cost = 1; // Cost of the `index' instruction
auto LT = getTypeLegalizationCost(RetTy);
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
index 0a154d09c36ba..c208d03ff94b7 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -663,13 +663,13 @@ define void @fcopysign_fp16() {
define void @fma() {
; CHECK-LABEL: 'fma'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F128 = call fp128 @llvm.fma.f128(fp128 undef, fp128 undef, fp128 undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V2F128 = call <2 x fp128> @llvm.fma.v2f128(<2 x fp128> undef, <2 x fp128> undef, <2 x fp128> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -698,10 +698,10 @@ define void @fma_fp16() {
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-FP16-LABEL: 'fma_fp16'
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
@@ -713,13 +713,13 @@ define void @fma_fp16() {
define void @fmuladd() {
; CHECK-LABEL: 'fmuladd'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %F32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %F64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:6 SizeLat:2 for: %F128 = call fp128 @llvm.fmuladd.f128(fp128 undef, fp128 undef, fp128 undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:2 Lat:6 SizeLat:2 for: %V2F128 = call <2 x fp128> @llvm.fmuladd.v2f128(<2 x fp128> undef, <2 x fp128> undef, <2 x fp128> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -748,10 +748,10 @@ define void @fmuladd_fp16() {
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-FP16-LABEL: 'fmuladd_fp16'
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %F16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V4F16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V8F16 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%F16 = call half @llvm.fmuladd.f32(half undef, half undef, half undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
index f7ebd406d230a..1c40354892191 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
@@ -167,14 +167,14 @@ define void @frem() {
define void @fma() {
; CHECK-LABEL: 'fma'
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V4F16 = call <vscale x 4 x half> @llvm.fma.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
@@ -193,14 +193,14 @@ define void @fma() {
define void @fmuladd() {
; CHECK-LABEL: 'fmuladd'
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V4F16 = call <vscale x 4 x half> @llvm.fmuladd.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
index d9710328d6048..f3542f63a4273 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
@@ -4,73 +4,102 @@
define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr %end1, double %x, i64 %n) {
; CHECK-LABEL: define double @fp128_fmuladd_reduction(
; CHECK-SAME: ptr [[START0:%.*]], ptr [[START1:%.*]], ptr [[END0:%.*]], ptr [[END1:%.*]], double [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[ITER_CHECK:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[N_VEC]], 16
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 8
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP2]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ [[X]], %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 16
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 48
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START0]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP4]]
-; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP5]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ [[X]], %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 6
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x fp128>, ptr [[TMP1]], align 16
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x fp128>, ptr [[TMP24]], align 16
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x fp128>, ptr [[TMP4]], align 16
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x fp128>, ptr [[TMP5]], align 16
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP3]], i32 4
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP3]], i32 6
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP28]], align 16
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP35]], align 16
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x double>, ptr [[TMP36]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD]] to <2 x double>
+; CHECK-NEXT: [[TMP11:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD3]] to <2 x double>
+; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD4]] to <2 x double>
+; CHECK-NEXT: [[TMP13:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD5]] to <2 x double>
+; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP10]], [[WIDE_LOAD6]]
+; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP11]], [[WIDE_LOAD7]]
+; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP12]], [[WIDE_LOAD8]]
+; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP13]], [[WIDE_LOAD9]]
+; CHECK-NEXT: [[TMP18:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI]], <2 x double> [[TMP14]])
+; CHECK-NEXT: [[TMP19:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP18]], <2 x double> [[TMP15]])
+; CHECK-NEXT: [[TMP20:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP19]], <2 x double> [[TMP16]])
+; CHECK-NEXT: [[TMP21]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP20]], <2 x double> [[TMP17]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 16
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP6]]
-; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = mul i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX4]], 8
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX4]], 16
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX4]], 24
-; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START1]], i64 [[OFFSET_IDX4]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 8
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP7]]
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP21]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[X]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 2
+; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]]
+; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[N_VEC11]], 16
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC11]], 8
; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP8]]
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[VEC_EPILOG_PH]] ], [ [[TMP33:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX12]], 16
+; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[START0]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[INDEX12]], 8
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = load fp128, ptr [[NEXT_GEP]], align 16
-; CHECK-NEXT: [[TMP11:%.*]] = load fp128, ptr [[NEXT_GEP1]], align 16
-; CHECK-NEXT: [[TMP12:%.*]] = load fp128, ptr [[NEXT_GEP2]], align 16
-; CHECK-NEXT: [[TMP13:%.*]] = load fp128, ptr [[NEXT_GEP3]], align 16
-; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[NEXT_GEP5]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[NEXT_GEP6]], align 16
-; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[NEXT_GEP7]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[NEXT_GEP8]], align 16
-; CHECK-NEXT: [[TMP18:%.*]] = fptrunc fp128 [[TMP10]] to double
-; CHECK-NEXT: [[TMP19:%.*]] = fptrunc fp128 [[TMP11]] to double
-; CHECK-NEXT: [[TMP20:%.*]] = fptrunc fp128 [[TMP12]] to double
-; CHECK-NEXT: [[TMP21:%.*]] = fptrunc fp128 [[TMP13]] to double
-; CHECK-NEXT: [[TMP22:%.*]] = fmul double [[TMP18]], [[TMP14]]
-; CHECK-NEXT: [[TMP23:%.*]] = fmul double [[TMP19]], [[TMP15]]
-; CHECK-NEXT: [[TMP24:%.*]] = fmul double [[TMP20]], [[TMP16]]
-; CHECK-NEXT: [[TMP25:%.*]] = fmul double [[TMP21]], [[TMP17]]
-; CHECK-NEXT: [[TMP26:%.*]] = fadd double [[VEC_PHI]], [[TMP22]]
-; CHECK-NEXT: [[TMP27:%.*]] = fadd double [[TMP26]], [[TMP23]]
-; CHECK-NEXT: [[TMP28:%.*]] = fadd double [[TMP27]], [[TMP24]]
-; CHECK-NEXT: [[TMP29]] = fadd double [[TMP28]], [[TMP25]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[START0]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[START1]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP29]], %[[MIDDLE_BLOCK]] ], [ [[X]], %[[ENTRY]] ]
+; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x fp128>, ptr [[NEXT_GEP14]], align 16
+; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x double>, ptr [[NEXT_GEP8]], align 16
+; CHECK-NEXT: [[TMP31:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD17]] to <2 x double>
+; CHECK-NEXT: [[TMP32:%.*]] = fmul <2 x double> [[TMP31]], [[WIDE_LOAD18]]
+; CHECK-NEXT: [[TMP33]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI13]], <2 x double> [[TMP32]])
+; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX12]], 2
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC11]]
+; CHECK-NEXT: br i1 [[TMP34]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[N]], [[N_VEC11]]
+; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP26]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[NEXT_GEP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START0]], %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL21:%.*]] = phi ptr [ [[NEXT_GEP7]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[NEXT_GEP6]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START1]], %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi i64 [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi double [ [[TMP33]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP21]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[X]], %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PTR0:%.*]] = phi ptr [ [[PTR0_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ [[PTR1_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL9]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL10]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi double [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR0:%.*]] = phi ptr [ [[PTR0_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ [[PTR1_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL21]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL22]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi double [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX23]], %[[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PTR0_NEXT]] = getelementptr i8, ptr [[PTR0]], i64 16
; CHECK-NEXT: [[PTR1_NEXT]] = getelementptr i8, ptr [[PTR1]], i64 8
; CHECK-NEXT: [[LOAD0:%.*]] = load fp128, ptr [[PTR0]], align 16
@@ -79,9 +108,9 @@ define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr
; CHECK-NEXT: [[RED_NEXT]] = tail call double @llvm.fmuladd.f64(double [[TRUNC]], double [[LOAD1]], double [[RED]])
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[TMP33]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret double [[LCSSA]]
;
entry:
@@ -110,5 +139,6 @@ exit:
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
index f6f2e39594dd8..64fc573e660bd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
@@ -1310,7 +1310,8 @@ define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) {
;
; LIBMVEC-SVE-LABEL: define void @fma_f64
; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+; LIBMVEC-SVE: [[TMP8:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD]])
+; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[IN:%.*]], double [[IN]], double [[IN]])
;
; SLEEF-NEON-LABEL: define void @fma_f64
; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
@@ -1357,7 +1358,8 @@ define void @fma_f32(ptr noalias %in.ptr, ptr %out.ptr) {
;
; LIBMVEC-SVE-LABEL: define void @fma_f32
; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+; LIBMVEC-SVE: [[TMP8:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD]])
+; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[IN:%.*]], float [[IN]], float [[IN]])
;
; SLEEF-NEON-LABEL: define void @fma_f32
; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index cca58d8d66f04..26ce0fc6e6a3b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -12,8 +12,7 @@ define void @test() {
; CHECK: [[BB63]]:
; CHECK-NEXT: br label %[[BB64]]
; CHECK: [[BB64]]:
-; CHECK-NEXT: [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
-; CHECK-NEXT: [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
; CHECK-NEXT: [[I66:%.*]] = load float, ptr poison, align 16
; CHECK-NEXT: [[I67:%.*]] = load float, ptr poison, align 4
; CHECK-NEXT: [[I68:%.*]] = load float, ptr poison, align 8
@@ -25,122 +24,57 @@ define void @test() {
; CHECK-NEXT: [[I74:%.*]] = load float, ptr poison, align 4
; CHECK-NEXT: [[I75:%.*]] = load float, ptr poison, align 16
; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15
; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]]
; CHECK: [[BB77]]:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[I70]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[I67]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[I69]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[I66]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[I68]], i32 2
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[I66]], i32 3
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[I67]], i32 6
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I69]], i32 7
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP20]], <16 x float> [[TMP0]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison>
; CHECK-NEXT: br label %[[BB78:.*]]
; CHECK: [[BB78]]:
-; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[TMP46:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP39:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP30:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[TMP53:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[TMP40:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP0]], %[[BB77]] ], [ [[TMP38:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP1]], %[[BB77]] ], [ [[TMP35:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP3]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x float> [ [[TMP4]], %[[BB77]] ], [ [[TMP29:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x float> [[TMP8]], poison
-; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP7]], poison
-; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[TMP6]], poison
-; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[TMP5]], poison
-; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP8]], poison
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP7]], poison
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <2 x float> [[TMP6]], poison
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[TMP5]], poison
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP19]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <2 x float> [[TMP14]], [[TMP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <2 x float> [[TMP21]], poison
-; CHECK-NEXT: [[TMP23:%.*]] = fadd fast <2 x float> [[TMP16]], [[TMP10]]
-; CHECK-NEXT: [[TMP24:%.*]] = fadd fast <2 x float> [[TMP23]], poison
-; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x float> [[TMP18]], [[TMP11]]
-; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x float> [[TMP25]], poison
-; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x float> [[TMP20]], [[TMP12]]
-; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x float> [[TMP27]], poison
-; CHECK-NEXT: [[TMP29]] = fadd fast <2 x float> [[TMP22]], poison
-; CHECK-NEXT: [[TMP30]] = extractelement <2 x float> [[TMP29]], i32 1
-; CHECK-NEXT: [[TMP31]] = extractelement <2 x float> [[TMP29]], i32 0
-; CHECK-NEXT: [[TMP32]] = fadd fast <2 x float> [[TMP24]], poison
-; CHECK-NEXT: [[TMP53]] = extractelement <2 x float> [[TMP32]], i32 1
-; CHECK-NEXT: [[TMP46]] = extractelement <2 x float> [[TMP32]], i32 0
-; CHECK-NEXT: [[TMP35]] = fadd fast <2 x float> [[TMP26]], poison
-; CHECK-NEXT: [[TMP36]] = extractelement <2 x float> [[TMP35]], i32 1
-; CHECK-NEXT: [[TMP37]] = extractelement <2 x float> [[TMP35]], i32 0
-; CHECK-NEXT: [[TMP38]] = fadd fast <2 x float> [[TMP28]], poison
-; CHECK-NEXT: [[TMP39]] = extractelement <2 x float> [[TMP38]], i32 1
-; CHECK-NEXT: [[TMP40]] = extractelement <2 x float> [[TMP38]], i32 0
-; CHECK-NEXT: [[I135:%.*]] = fmul fast float [[I85]], [[I65]]
-; CHECK-NEXT: [[I128:%.*]] = fmul fast float [[I80]], [[I65]]
-; CHECK-NEXT: [[I129:%.*]] = fmul fast float [[I81]], [[I65]]
-; CHECK-NEXT: [[I130:%.*]] = fmul fast float [[I82]], [[I65]]
-; CHECK-NEXT: [[I133:%.*]] = fmul fast float [[I84]], [[I77]]
-; CHECK-NEXT: [[I136:%.*]] = fmul fast float [[I127]], [[I77]]
-; CHECK-NEXT: [[I138:%.*]] = fmul fast float [[I131]], [[I77]]
-; CHECK-NEXT: [[I137:%.*]] = fmul fast float [[I86]], [[I77]]
-; CHECK-NEXT: [[OP_RDX14:%.*]] = fadd fast float poison, [[I133]]
-; CHECK-NEXT: [[OP_RDX15:%.*]] = fadd fast float [[OP_RDX14]], [[I135]]
-; CHECK-NEXT: [[OP_RDX12:%.*]] = fadd fast float poison, [[I136]]
-; CHECK-NEXT: [[OP_RDX13:%.*]] = fadd fast float [[OP_RDX12]], [[I128]]
-; CHECK-NEXT: [[OP_RDX10:%.*]] = fadd fast float poison, [[I138]]
-; CHECK-NEXT: [[OP_RDX11:%.*]] = fadd fast float [[OP_RDX10]], [[I129]]
-; CHECK-NEXT: [[OP_RDX8:%.*]] = fadd fast float poison, [[I137]]
-; CHECK-NEXT: [[OP_RDX9:%.*]] = fadd fast float [[OP_RDX8]], [[I130]]
-; CHECK-NEXT: [[TMP41:%.*]] = fmul fast <2 x float> [[TMP8]], poison
-; CHECK-NEXT: [[TMP42:%.*]] = fmul fast <2 x float> [[TMP7]], poison
-; CHECK-NEXT: [[TMP43:%.*]] = fmul fast <2 x float> [[TMP6]], poison
-; CHECK-NEXT: [[TMP44:%.*]] = fmul fast <2 x float> [[TMP5]], poison
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[TMP41]], i32 1
-; CHECK-NEXT: [[I157:%.*]] = fadd fast float poison, [[TMP45]]
-; CHECK-NEXT: [[I150:%.*]] = extractelement <2 x float> [[TMP41]], i32 0
-; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x float> [[TMP42]], i32 1
-; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast float poison, [[TMP47]]
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x float> [[TMP42]], i32 0
-; CHECK-NEXT: [[OP_RDX5:%.*]] = fadd fast float [[OP_RDX4]], [[TMP48]]
-; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x float> [[TMP43]], i32 1
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float poison, [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x float> [[TMP43]], i32 0
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP50]]
-; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x float> [[TMP44]], i32 0
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float poison, [[TMP51]]
-; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x float> [[TMP44]], i32 1
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP52]]
+; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP14]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi <8 x float> [ [[TMP19]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
+; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <16 x float> [[TMP25]], [[TMP0]]
+; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], [[TMP26]]
+; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison
+; CHECK-NEXT: [[TMP30:%.*]] = fadd fast <16 x float> [[TMP29]], poison
+; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP32]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15>
; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]]
; CHECK: [[BB167]]:
-; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[OP_RDX1]], %[[BB78]] ]
-; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX3]], %[[BB78]] ]
-; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX5]], %[[BB78]] ]
-; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[TMP60]], %[[BB78]] ]
-; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[OP_RDX9]], %[[BB78]] ]
-; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[OP_RDX11]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[OP_RDX13]], %[[BB78]] ]
-; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[OP_RDX15]], %[[BB78]] ]
-; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[TMP40]], %[[BB78]] ]
-; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP36]], %[[BB78]] ]
-; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[TMP53]], %[[BB78]] ]
-; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP30]], %[[BB78]] ]
-; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP39]], %[[BB78]] ]
-; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[TMP37]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[TMP46]], %[[BB78]] ]
-; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[TMP31]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP35:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP30]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP35]], i32 14
; CHECK-NEXT: store float [[TMP33]], ptr poison, align 1
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP35]], i32 13
; CHECK-NEXT: store float [[TMP34]], ptr poison, align 1
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x float> [[TMP35]], i32 15
; CHECK-NEXT: br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]]
; CHECK: [[BB184]]:
; CHECK-NEXT: br label %[[BB185:.*]]
; CHECK: [[BB185]]:
; CHECK-NEXT: br i1 poison, label %[[BB185]], label %[[BB186]]
; CHECK: [[BB186]]:
-; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[I178]], %[[BB167]] ], [ poison, %[[BB185]] ]
+; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[TMP36]], %[[BB167]] ], [ poison, %[[BB185]] ]
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 2b591a2165534..683b92752c702 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -402,19 +402,32 @@ entry:
}
define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
-; CHECK-LABEL: define void @reuse_shuffle_indices_cost_crash_2(
-; CHECK-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
-; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
-; CHECK-NEXT: store float [[TMP1]], ptr [[BEZT]], align 4
-; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
-; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @reuse_shuffle_indices_cost_crash_2(
+; NON-POW2-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[FNEG]], i32 0
+; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> <float poison, float poison, float 0.000000e+00>, float [[TMP0]], i32 0
+; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> <i32 0, i32 0, i32 2>
+; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP2]], <3 x float> [[TMP4]], <3 x float> zeroinitializer)
+; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[BEZT]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @reuse_shuffle_indices_cost_crash_2(
+; POW2-ONLY-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[BEZT]], align 4
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
+; POW2-ONLY-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
+; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX8_I831]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%fneg = fmul float %0, 0.000000e+00
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index b5d74f0b91ab8..eefc99feebb95 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -24,21 +24,17 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778)
; AARCH86-NEXT: entry:
; AARCH86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
; AARCH86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; AARCH86-NEXT: [[I1773:%.*]] = fmul fast double [[I1772]], [[I1754:%.*]]
-; AARCH86-NEXT: [[I1782:%.*]] = fmul fast double [[I1754]], [[I1754]]
-; AARCH86-NEXT: [[I1783:%.*]] = fadd fast double [[I1782]], 1.000000e+00
-; AARCH86-NEXT: [[I1787:%.*]] = fmul fast double [[I1778:%.*]], [[I1754]]
-; AARCH86-NEXT: [[I1788:%.*]] = fadd fast double [[I1787]], 1.000000e+00
-; AARCH86-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]]
-; AARCH86-NEXT: [[I1793:%.*]] = fadd fast double [[I1792]], 1.000000e+00
; AARCH86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
; AARCH86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781]]
-; AARCH86-NEXT: [[TMP4:%.*]] = fadd fast double [[I1773]], [[I1797]]
-; AARCH86-NEXT: [[I1976:%.*]] = insertelement <4 x double> zeroinitializer, double [[I1783]], i64 0
-; AARCH86-NEXT: [[I1982:%.*]] = insertelement <4 x double> [[I1976]], double [[I1788]], i64 1
-; AARCH86-NEXT: [[I1988:%.*]] = insertelement <4 x double> [[I1982]], double [[I1793]], i64 2
-; AARCH86-NEXT: [[I1994:%.*]] = insertelement <4 x double> [[I1988]], double [[TMP4]], i64 3
+; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; AARCH86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; AARCH86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
+; AARCH86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; AARCH86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; AARCH86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; AARCH86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; AARCH86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; AARCH86-NEXT: [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
; AARCH86-NEXT: ret <4 x double> [[I1994]]
;
entry: