[llvm] 775451b - [AArch64] Cost-model vector splat LD1Rs to avoid unprofitable SLP vectorisation

Mon Mar 13 08:14:05 PDT 2023

Author: Sjoerd Meijer
Date: 2023-03-13T14:52:09Z
New Revision: 775451b66a4c726f1c3925f74be8cefcb308e4f8

URL: https://github.com/llvm/llvm-project/commit/775451b66a4c726f1c3925f74be8cefcb308e4f8
DIFF: https://github.com/llvm/llvm-project/commit/775451b66a4c726f1c3925f74be8cefcb308e4f8.diff

LOG: [AArch64] Cost-model vector splat LD1Rs to avoid unprofitable SLP vectorisation

This slightly increases the costs of InsertElement instructions that are part
of a vector splat sequence, i.e. a load, InsertElement and a shuffle (load +
dup). The resulting LD1R is a high latency instruction, and this slight
increase in costs avoids SLP vectorisation for a couple of cases where this
isn't profitable.

Fixes: https://github.com/llvm/llvm-project/issues/61047

Differential Revision: https://reviews.llvm.org/D145578

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 270c6d13c225b..33c6555cc710e 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3219,13 +3219,19 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask);
 
-  // Check for broadcast loads.
-  if (Kind == TTI::SK_Broadcast) {
+  // Check for broadcast loads, which are supported by the LD1R instruction.
+  // In terms of code-size, the shuffle vector is free when a load + dup get
+  // folded into a LD1R. That's what we check and return here. For performance
+  // and reciprocal throughput, a LD1R is not completely free. In this case, we
+  // return the cost for the broadcast below (i.e. 1 for most/all types), so
+  // that we model the load + dup sequence slightly higher because LD1R is a
+  // high latency instruction.
+  if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
     bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
     if (IsLoad && LT.second.isVector() &&
         isLegalBroadcastLoad(Tp->getElementType(),
                              LT.second.getVectorElementCount()))
-      return 0; // broadcast is handled by ld1r
+      return 0;
   }
 
   // If we have 4 elements for the shuffle and a Mask, get the cost straight

diff  --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
index adf908de47f0c..52e392b504402 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=CODESIZE
 
 ; These tests check the costs of ld1r instructions, through the
 ; isLegalBroadcastLoad method.
@@ -15,46 +16,93 @@ define void @shuffle() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv4i8 = load <4 x i8>, ptr undef, align 4
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv4i8 = shufflevector <4 x i8> %lv4i8, <4 x i8> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv8i8 = load <8 x i8>, ptr undef, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8i8 = shufflevector <8 x i8> %lv8i8, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv8i8 = shufflevector <8 x i8> %lv8i8, <8 x i8> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv16i8 = load <16 x i8>, ptr undef, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv16i8 = shufflevector <16 x i8> %lv16i8, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv16i8 = shufflevector <16 x i8> %lv16i8, <16 x i8> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %lv2i16 = load <2 x i16>, ptr undef, align 4
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv2i16 = shufflevector <2 x i16> %lv2i16, <2 x i16> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4i16 = load <4 x i16>, ptr undef, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4i16 = shufflevector <4 x i16> %lv4i16, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv4i16 = shufflevector <4 x i16> %lv4i16, <4 x i16> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv8i16 = load <8 x i16>, ptr undef, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8i16 = shufflevector <8 x i16> %lv8i16, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv8i16 = shufflevector <8 x i16> %lv8i16, <8 x i16> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv16i16 = load <16 x i16>, ptr undef, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv16i16 = shufflevector <16 x i16> %lv16i16, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sv16i16 = shufflevector <16 x i16> %lv16i16, <16 x i16> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2i32 = load <2 x i32>, ptr undef, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2i32 = shufflevector <2 x i32> %lv2i32, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv2i32 = shufflevector <2 x i32> %lv2i32, <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4i32 = load <4 x i32>, ptr undef, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4i32 = shufflevector <4 x i32> %lv4i32, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv4i32 = shufflevector <4 x i32> %lv4i32, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv8i32 = load <8 x i32>, ptr undef, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8i32 = shufflevector <8 x i32> %lv8i32, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sv8i32 = shufflevector <8 x i32> %lv8i32, <8 x i32> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2i64 = load <2 x i64>, ptr undef, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2i64 = shufflevector <2 x i64> %lv2i64, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv2i64 = shufflevector <2 x i64> %lv2i64, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv4i64 = load <4 x i64>, ptr undef, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4i64 = shufflevector <4 x i64> %lv4i64, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sv4i64 = shufflevector <4 x i64> %lv4i64, <4 x i64> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2f16 = load <2 x half>, ptr undef, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2f16 = shufflevector <2 x half> %lv2f16, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sv2f16 = shufflevector <2 x half> %lv2f16, <2 x half> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4f16 = load <4 x half>, ptr undef, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4f16 = shufflevector <4 x half> %lv4f16, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv4f16 = shufflevector <4 x half> %lv4f16, <4 x half> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv8f16 = load <8 x half>, ptr undef, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8f16 = shufflevector <8 x half> %lv8f16, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %sv8f16 = shufflevector <8 x half> %lv8f16, <8 x half> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv16f16 = load <16 x half>, ptr undef, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv16f16 = shufflevector <16 x half> %lv16f16, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %sv16f16 = shufflevector <16 x half> %lv16f16, <16 x half> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2f32 = load <2 x float>, ptr undef, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2f32 = shufflevector <2 x float> %lv2f32, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv2f32 = shufflevector <2 x float> %lv2f32, <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4f32 = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4f32 = shufflevector <4 x float> %lv4f32, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv4f32 = shufflevector <4 x float> %lv4f32, <4 x float> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv8f32 = load <8 x float>, ptr undef, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8f32 = shufflevector <8 x float> %lv8f32, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sv8f32 = shufflevector <8 x float> %lv8f32, <8 x float> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2f64 = load <2 x double>, ptr undef, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2f64 = shufflevector <2 x double> %lv2f64, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv2f64 = shufflevector <2 x double> %lv2f64, <2 x double> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv4f64 = load <4 x double>, ptr undef, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4f64 = shufflevector <4 x double> %lv4f64, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sv4f64 = shufflevector <4 x double> %lv4f64, <4 x double> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CODESIZE-LABEL: 'shuffle'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2i8 = load <2 x i8>, ptr undef, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv2i8 = shufflevector <2 x i8> %lv2i8, <2 x i8> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4i8 = load <4 x i8>, ptr undef, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv4i8 = shufflevector <4 x i8> %lv4i8, <4 x i8> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv8i8 = load <8 x i8>, ptr undef, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8i8 = shufflevector <8 x i8> %lv8i8, <8 x i8> undef, <8 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv16i8 = load <16 x i8>, ptr undef, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv16i8 = shufflevector <16 x i8> %lv16i8, <16 x i8> undef, <16 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2i16 = load <2 x i16>, ptr undef, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sv2i16 = shufflevector <2 x i16> %lv2i16, <2 x i16> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4i16 = load <4 x i16>, ptr undef, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4i16 = shufflevector <4 x i16> %lv4i16, <4 x i16> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv8i16 = load <8 x i16>, ptr undef, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8i16 = shufflevector <8 x i16> %lv8i16, <8 x i16> undef, <8 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv16i16 = load <16 x i16>, ptr undef, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv16i16 = shufflevector <16 x i16> %lv16i16, <16 x i16> undef, <16 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2i32 = load <2 x i32>, ptr undef, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2i32 = shufflevector <2 x i32> %lv2i32, <2 x i32> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4i32 = load <4 x i32>, ptr undef, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4i32 = shufflevector <4 x i32> %lv4i32, <4 x i32> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv8i32 = load <8 x i32>, ptr undef, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8i32 = shufflevector <8 x i32> %lv8i32, <8 x i32> undef, <8 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2i64 = load <2 x i64>, ptr undef, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2i64 = shufflevector <2 x i64> %lv2i64, <2 x i64> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv4i64 = load <4 x i64>, ptr undef, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4i64 = shufflevector <4 x i64> %lv4i64, <4 x i64> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2f16 = load <2 x half>, ptr undef, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2f16 = shufflevector <2 x half> %lv2f16, <2 x half> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4f16 = load <4 x half>, ptr undef, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4f16 = shufflevector <4 x half> %lv4f16, <4 x half> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv8f16 = load <8 x half>, ptr undef, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8f16 = shufflevector <8 x half> %lv8f16, <8 x half> undef, <8 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv16f16 = load <16 x half>, ptr undef, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv16f16 = shufflevector <16 x half> %lv16f16, <16 x half> undef, <16 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2f32 = load <2 x float>, ptr undef, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2f32 = shufflevector <2 x float> %lv2f32, <2 x float> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv4f32 = load <4 x float>, ptr undef, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4f32 = shufflevector <4 x float> %lv4f32, <4 x float> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv8f32 = load <8 x float>, ptr undef, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv8f32 = shufflevector <8 x float> %lv8f32, <8 x float> undef, <8 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lv2f64 = load <2 x double>, ptr undef, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv2f64 = shufflevector <2 x double> %lv2f64, <2 x double> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %lv4f64 = load <4 x double>, ptr undef, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sv4f64 = shufflevector <4 x double> %lv4f64, <4 x double> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %lv2i8 = load <2 x i8>, ptr undef
   %sv2i8 = shufflevector <2 x i8> %lv2i8, <2 x i8> undef, <2 x i32> zeroinitializer
@@ -119,6 +167,12 @@ define <4 x half> @ld1r_4h_float_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x half> %tmp1, <4 x half> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_4h_float_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load half, ptr %x, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <4 x half> undef, half %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x half> %tmp1, <4 x half> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x half> %lane
+;
 entry:
   %tmp = load half, ptr %x, align 2
   %tmp1 = insertelement <4 x half> undef, half %tmp, i32 0
@@ -133,6 +187,12 @@ define <8 x half> @ld1r_8h_float_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %lane = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_8h_float_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load half, ptr %x, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <8 x half> undef, half %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %lane = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x half> %lane
+;
 entry:
   %tmp = load half, ptr %x, align 2
   %tmp1 = insertelement <8 x half> undef, half %tmp, i32 0
@@ -147,6 +207,12 @@ define <2 x float> @ld1r_2s_float_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_2s_float_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load float, ptr %x, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %lane
+;
 entry:
   %tmp = load float, ptr %x, align 4
   %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0
@@ -161,6 +227,12 @@ define <4 x float> @ld1r_4s_float_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_4s_float_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load float, ptr %x, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %lane
+;
 entry:
   %tmp = load float, ptr %x, align 4
   %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0
@@ -175,6 +247,12 @@ define <2 x double> @ld1r_2d_double_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x double> %tmp1, <2 x double> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_2d_double_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load double, ptr %x, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x double> %tmp1, <2 x double> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %lane
+;
 entry:
   %tmp = load double, ptr %x, align 4
   %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0
@@ -191,6 +269,12 @@ define <8 x i8> @ld1r_8b_int_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_8b_int_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %lane
+;
 entry:
   %tmp = load i8, ptr %x, align 2
   %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
@@ -205,6 +289,12 @@ define <16 x i8> @ld1r_16b_int_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_16b_int_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %lane
+;
 entry:
   %tmp = load i8, ptr %x, align 2
   %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
@@ -219,6 +309,12 @@ define <4 x i16> @ld1r_4h_int_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_4h_int_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %lane
+;
 entry:
   %tmp = load i16, ptr %x, align 2
   %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
@@ -233,6 +329,12 @@ define <8 x i16> @ld1r_8h_int_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_8h_int_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %lane
+;
 entry:
   %tmp = load i16, ptr %x, align 2
   %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
@@ -247,6 +349,12 @@ define <2 x i32> @ld1r_2s_int_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_2s_int_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %lane
+;
 entry:
   %tmp = load i32, ptr %x, align 4
   %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
@@ -261,6 +369,12 @@ define <4 x i32> @ld1r_4s_int_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_4s_int_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %lane
+;
 entry:
   %tmp = load i32, ptr %x, align 4
   %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
@@ -275,6 +389,12 @@ define <2 x i64> @ld1r_2d_int_shuff(ptr nocapture %x) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %lane
 ;
+; CODESIZE-LABEL: 'ld1r_2d_int_shuff'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i64, ptr %x, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %lane
+;
 entry:
   %tmp = load i64, ptr %x, align 8
   %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 3327cb0e51a8e..4214ecf0bb787 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -7,21 +7,20 @@ define void @slp_not_profitable_with_fast_fmf(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -47,21 +46,20 @@ define void @slp_not_profitable_with_reassoc_fmf(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub reassoc <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd reassoc <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -88,21 +86,20 @@ define void @slp_profitable_missing_fmf_on_fadd_fsub(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fadd_fsub(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -129,21 +126,20 @@ define void @slp_profitable_missing_fmf_on_fmul_fadd_fsub(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fmul_fadd_fsub(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -170,21 +166,20 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_profitable_missing_fmf_nnans_only(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul nnan float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul nnan <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul nnan <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub nnan <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd nnan <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul nnan float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub nnan float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul nnan float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul nnan float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd nnan float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -267,16 +262,16 @@ define void @slp_profitable(ptr %A, ptr %B, float %0) {
 ; CHECK-NEXT:    [[SUB_I1096:%.*]] = fsub fast float 1.000000e+00, [[TMP0:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x float> [[SHUFFLE1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fsub fast <2 x float> [[SHUFFLE1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x float> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fsub fast <2 x float> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP11]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: