[llvm] 54177e9 - [Matrix] Use tiled loops automatically for large kernels. (#179325)

via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 11 07:36:40 PST 2026


Author: Florian Hahn
Date: 2026-02-11T15:36:34Z
New Revision: 54177e95d1956aa040503d683762d4bfa3ff954b

URL: https://github.com/llvm/llvm-project/commit/54177e95d1956aa040503d683762d4bfa3ff954b
DIFF: https://github.com/llvm/llvm-project/commit/54177e95d1956aa040503d683762d4bfa3ff954b.diff

LOG: [Matrix] Use tiled loops automatically for large kernels. (#179325)

Update LowerMatrixIntrinsics to use tiled loops automatically for
larger matrixes. The fully unrolled codegen creates a huge amount of
code, which performs noticeably worse than the tiled loop nest variant.

We now try to estimate the number of instructions needed for the
multiply, and if it is too large, tiled loops are used. The current
threshold is anything roughly larger than 6x6x6 double multiply.

Eventually I think we want to only generate tiled loops. This patch is a
first step, trying to opt in for cases where we know it is beneficial.
Checked on AArch64, but should help on other architectures similarly,
and also drastically reduce binary size + compile time.

PR: https://github.com/llvm/llvm-project/pull/179325

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
    llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
    llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
    llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
    llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
    llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
    llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 5d558c2f7a341..ecf295dc75c3a 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -68,9 +68,10 @@ static cl::opt<unsigned> TileSize(
     "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
     cl::desc(
         "Tile size for matrix instruction fusion using square-shaped tiles."));
-static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),
-                                  cl::Hidden,
-                                  cl::desc("Generate loop nest for tiling."));
+static cl::opt<unsigned>
+    TileLoopsThreshold("fuse-matrix-loops-threshold", cl::init(200), cl::Hidden,
+                       cl::desc("Generate loop nests for tiling when expected "
+                                "number of operations exceeds threshold."));
 static cl::opt<bool> ForceFusion(
     "force-fuse-matrix", cl::init(false), cl::Hidden,
     cl::desc("Force matrix instruction fusion even if not profitable."));
@@ -612,6 +613,24 @@ class LowerMatrixIntrinsics {
                                 .getFixedValue()));
   }
 
+  /// Estimate the number of native vector operations for a multiply of matrices
+  /// with dimensions \p R x \p M and \p M x \p C. Native ops are computed as
+  /// ceil(ElementCount * ElementBits / RegisterBits).
+  ///
+  /// Native vector ops per operation type (VF = native vector elements):
+  ///   FMAs:    C * ceil(R/VF) * M (one FMA per VF output elements)
+  ///   A loads: ceil(R/VF) * M (A has M columns, ceil(R/VF) native loads each)
+  ///   B loads: ceil(M/VF) * C (B has C columns, ceil(M/VF) native loads each)
+  ///   Stores:  C * ceil(R/VF) (one store per VF output elements)
+  unsigned getNumNativeVectorOps(Type *EltType, unsigned R, unsigned M,
+                                 unsigned C) {
+    unsigned NumFMAs = C * getNumOps(EltType, R) * M;
+    unsigned NumALoads = getNumOps(EltType, R) * M;
+    unsigned NumBLoads = getNumOps(EltType, M) * C;
+    unsigned NumStores = getNumOps(EltType, R) * C;
+    return NumFMAs + NumALoads + NumBLoads + NumStores;
+  }
+
   /// Return the set of vectors that a matrix value is lowered to.
   ///
   /// If we lowered \p MatrixVal, just return the cache result matrix. Otherwise
@@ -2057,7 +2076,12 @@ class LowerMatrixIntrinsics {
     Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
     Value *CPtr = Store->getPointerOperand();
 
-    if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
+    // Use loop-based tiling when the number of expected operations exceeds
+    // threshold.
+    unsigned NumOps = getNumNativeVectorOps(EltType, R, M, C);
+    bool UseLoops =
+        (NumOps > TileLoopsThreshold) && R % TileSize == 0 && C % TileSize == 0;
+    if (UseLoops)
       createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store);
     else {
       IRBuilder<> Builder(Store);

diff  --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
index abd1d96937b28..12a833bca521d 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-loops-threshold=9999 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-loops-threshold=9999 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
 
 ; REQUIRES: aarch64-registered-target
 

diff  --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
index a6308c5a97333..ee42b10a03fbc 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops=false -fuse-matrix-tile-size=1 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-loops-threshold=999999 -fuse-matrix-tile-size=1 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
 
 ; REQUIRES: aarch64-registered-target
 

diff  --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
index ae10f20dd6e8a..868dfec92209a 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
@@ -959,2215 +959,82 @@ entry:
 define void @multiply_8x8x8(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
 ; CHECK-LABEL: define void @multiply_8x8x8(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
-; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 8
-; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
-; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <2 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT6]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK5]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP3]])
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT9]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <2 x double> [[BLOCK8]], [[SPLAT_SPLAT10]]
-; CHECK-NEXT:    [[BLOCK11:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK11]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP9]])
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP12]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr double, ptr [[A]], i64 16
-; CHECK-NEXT:    [[COL_LOAD14:%.*]] = load <2 x double>, ptr [[TMP14]], align 8
-; CHECK-NEXT:    [[VEC_GEP15:%.*]] = getelementptr double, ptr [[TMP14]], i64 8
-; CHECK-NEXT:    [[COL_LOAD16:%.*]] = load <2 x double>, ptr [[VEC_GEP15]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr double, ptr [[B]], i64 2
-; CHECK-NEXT:    [[COL_LOAD17:%.*]] = load <2 x double>, ptr [[TMP15]], align 8
-; CHECK-NEXT:    [[VEC_GEP18:%.*]] = getelementptr double, ptr [[TMP15]], i64 8
-; CHECK-NEXT:    [[COL_LOAD19:%.*]] = load <2 x double>, ptr [[VEC_GEP18]], align 8
-; CHECK-NEXT:    [[BLOCK20:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[COL_LOAD14]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD17]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT22:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT23:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT22]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK21]], <2 x double> [[SPLAT_SPLAT23]], <2 x double> [[BLOCK20]])
-; CHECK-NEXT:    [[BLOCK24:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD17]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT25:%.*]] = insertelement <2 x double> poison, double [[TMP18]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT26:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT25]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK24]], <2 x double> [[SPLAT_SPLAT26]], <2 x double> [[TMP17]])
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP20]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK27:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK28:%.*]] = shufflevector <2 x double> [[COL_LOAD14]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD19]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT29:%.*]] = insertelement <2 x double> poison, double [[TMP22]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT29]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP23:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK28]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[BLOCK27]])
-; CHECK-NEXT:    [[BLOCK31:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x double> [[COL_LOAD19]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT32:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT32]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK31]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP23]])
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x double> [[TMP25]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP26]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr double, ptr [[A]], i64 32
-; CHECK-NEXT:    [[COL_LOAD34:%.*]] = load <2 x double>, ptr [[TMP28]], align 8
-; CHECK-NEXT:    [[VEC_GEP35:%.*]] = getelementptr double, ptr [[TMP28]], i64 8
-; CHECK-NEXT:    [[COL_LOAD36:%.*]] = load <2 x double>, ptr [[VEC_GEP35]], align 8
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr double, ptr [[B]], i64 4
-; CHECK-NEXT:    [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[TMP29]], align 8
-; CHECK-NEXT:    [[VEC_GEP38:%.*]] = getelementptr double, ptr [[TMP29]], i64 8
-; CHECK-NEXT:    [[COL_LOAD39:%.*]] = load <2 x double>, ptr [[VEC_GEP38]], align 8
-; CHECK-NEXT:    [[BLOCK40:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK41:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x double> [[COL_LOAD37]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT42:%.*]] = insertelement <2 x double> poison, double [[TMP30]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT42]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK41]], <2 x double> [[SPLAT_SPLAT43]], <2 x double> [[BLOCK40]])
-; CHECK-NEXT:    [[BLOCK44:%.*]] = shufflevector <2 x double> [[COL_LOAD36]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x double> [[COL_LOAD37]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT45:%.*]] = insertelement <2 x double> poison, double [[TMP32]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT45]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP33:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK44]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP31]])
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <2 x double> [[TMP33]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> [[TMP34]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK47:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK48:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <2 x double> [[COL_LOAD39]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT49:%.*]] = insertelement <2 x double> poison, double [[TMP36]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT49]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK48]], <2 x double> [[SPLAT_SPLAT50]], <2 x double> [[BLOCK47]])
-; CHECK-NEXT:    [[BLOCK51:%.*]] = shufflevector <2 x double> [[COL_LOAD36]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <2 x double> [[COL_LOAD39]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT52:%.*]] = insertelement <2 x double> poison, double [[TMP38]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT52]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK51]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP37]])
-; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <2 x double> [[TMP39]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> [[TMP40]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr double, ptr [[A]], i64 48
-; CHECK-NEXT:    [[COL_LOAD54:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
-; CHECK-NEXT:    [[VEC_GEP55:%.*]] = getelementptr double, ptr [[TMP42]], i64 8
-; CHECK-NEXT:    [[COL_LOAD56:%.*]] = load <2 x double>, ptr [[VEC_GEP55]], align 8
-; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr double, ptr [[B]], i64 6
-; CHECK-NEXT:    [[COL_LOAD57:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
-; CHECK-NEXT:    [[VEC_GEP58:%.*]] = getelementptr double, ptr [[TMP43]], i64 8
-; CHECK-NEXT:    [[COL_LOAD59:%.*]] = load <2 x double>, ptr [[VEC_GEP58]], align 8
-; CHECK-NEXT:    [[BLOCK60:%.*]] = shufflevector <2 x double> [[TMP35]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK61:%.*]] = shufflevector <2 x double> [[COL_LOAD54]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x double> [[COL_LOAD57]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT62:%.*]] = insertelement <2 x double> poison, double [[TMP44]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT63:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT62]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK61]], <2 x double> [[SPLAT_SPLAT63]], <2 x double> [[BLOCK60]])
-; CHECK-NEXT:    [[BLOCK64:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <2 x double> [[COL_LOAD57]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT65:%.*]] = insertelement <2 x double> poison, double [[TMP46]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT66:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT65]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK64]], <2 x double> [[SPLAT_SPLAT66]], <2 x double> [[TMP45]])
-; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x double> [[TMP47]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP49:%.*]] = shufflevector <2 x double> [[TMP35]], <2 x double> [[TMP48]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK67:%.*]] = shufflevector <2 x double> [[TMP41]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK68:%.*]] = shufflevector <2 x double> [[COL_LOAD54]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <2 x double> [[COL_LOAD59]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT69:%.*]] = insertelement <2 x double> poison, double [[TMP50]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT70:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT69]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK68]], <2 x double> [[SPLAT_SPLAT70]], <2 x double> [[BLOCK67]])
-; CHECK-NEXT:    [[BLOCK71:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <2 x double> [[COL_LOAD59]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT72:%.*]] = insertelement <2 x double> poison, double [[TMP52]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT73:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT72]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK71]], <2 x double> [[SPLAT_SPLAT73]], <2 x double> [[TMP51]])
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <2 x double> [[TMP53]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <2 x double> [[TMP41]], <2 x double> [[TMP54]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr double, ptr [[C]], i64 0
-; CHECK-NEXT:    store <2 x double> [[TMP49]], ptr [[TMP56]], align 8
-; CHECK-NEXT:    [[VEC_GEP74:%.*]] = getelementptr double, ptr [[TMP56]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP55]], ptr [[VEC_GEP74]], align 8
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr double, ptr [[A]], i64 2
-; CHECK-NEXT:    [[COL_LOAD75:%.*]] = load <2 x double>, ptr [[TMP57]], align 8
-; CHECK-NEXT:    [[VEC_GEP76:%.*]] = getelementptr double, ptr [[TMP57]], i64 8
-; CHECK-NEXT:    [[COL_LOAD77:%.*]] = load <2 x double>, ptr [[VEC_GEP76]], align 8
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT:    [[COL_LOAD78:%.*]] = load <2 x double>, ptr [[TMP58]], align 8
-; CHECK-NEXT:    [[VEC_GEP79:%.*]] = getelementptr double, ptr [[TMP58]], i64 8
-; CHECK-NEXT:    [[COL_LOAD80:%.*]] = load <2 x double>, ptr [[VEC_GEP79]], align 8
-; CHECK-NEXT:    [[BLOCK81:%.*]] = shufflevector <2 x double> [[COL_LOAD75]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <2 x double> [[COL_LOAD78]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT82:%.*]] = insertelement <2 x double> poison, double [[TMP59]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT83:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT82]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP60:%.*]] = fmul contract <2 x double> [[BLOCK81]], [[SPLAT_SPLAT83]]
-; CHECK-NEXT:    [[BLOCK84:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <2 x double> [[COL_LOAD78]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT85:%.*]] = insertelement <2 x double> poison, double [[TMP61]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT86:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT85]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP62:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK84]], <2 x double> [[SPLAT_SPLAT86]], <2 x double> [[TMP60]])
-; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <2 x double> [[TMP62]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP63]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK87:%.*]] = shufflevector <2 x double> [[COL_LOAD75]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <2 x double> [[COL_LOAD80]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT88:%.*]] = insertelement <2 x double> poison, double [[TMP65]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT88]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP66:%.*]] = fmul contract <2 x double> [[BLOCK87]], [[SPLAT_SPLAT89]]
-; CHECK-NEXT:    [[BLOCK90:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <2 x double> [[COL_LOAD80]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT91:%.*]] = insertelement <2 x double> poison, double [[TMP67]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT91]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP68:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK90]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP66]])
-; CHECK-NEXT:    [[TMP69:%.*]] = shufflevector <2 x double> [[TMP68]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP69]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr double, ptr [[A]], i64 18
-; CHECK-NEXT:    [[COL_LOAD93:%.*]] = load <2 x double>, ptr [[TMP71]], align 8
-; CHECK-NEXT:    [[VEC_GEP94:%.*]] = getelementptr double, ptr [[TMP71]], i64 8
-; CHECK-NEXT:    [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[VEC_GEP94]], align 8
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr double, ptr [[B]], i64 2
-; CHECK-NEXT:    [[COL_LOAD96:%.*]] = load <2 x double>, ptr [[TMP72]], align 8
-; CHECK-NEXT:    [[VEC_GEP97:%.*]] = getelementptr double, ptr [[TMP72]], i64 8
-; CHECK-NEXT:    [[COL_LOAD98:%.*]] = load <2 x double>, ptr [[VEC_GEP97]], align 8
-; CHECK-NEXT:    [[BLOCK99:%.*]] = shufflevector <2 x double> [[TMP64]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK100:%.*]] = shufflevector <2 x double> [[COL_LOAD93]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x double> [[COL_LOAD96]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT101:%.*]] = insertelement <2 x double> poison, double [[TMP73]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT102:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT101]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP74:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK100]], <2 x double> [[SPLAT_SPLAT102]], <2 x double> [[BLOCK99]])
-; CHECK-NEXT:    [[BLOCK103:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x double> [[COL_LOAD96]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT104:%.*]] = insertelement <2 x double> poison, double [[TMP75]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT105:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT104]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP76:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK103]], <2 x double> [[SPLAT_SPLAT105]], <2 x double> [[TMP74]])
-; CHECK-NEXT:    [[TMP77:%.*]] = shufflevector <2 x double> [[TMP76]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP78:%.*]] = shufflevector <2 x double> [[TMP64]], <2 x double> [[TMP77]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK106:%.*]] = shufflevector <2 x double> [[TMP70]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK107:%.*]] = shufflevector <2 x double> [[COL_LOAD93]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <2 x double> [[COL_LOAD98]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT108:%.*]] = insertelement <2 x double> poison, double [[TMP79]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT109:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT108]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK107]], <2 x double> [[SPLAT_SPLAT109]], <2 x double> [[BLOCK106]])
-; CHECK-NEXT:    [[BLOCK110:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x double> [[COL_LOAD98]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT111:%.*]] = insertelement <2 x double> poison, double [[TMP81]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT112:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT111]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP82:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK110]], <2 x double> [[SPLAT_SPLAT112]], <2 x double> [[TMP80]])
-; CHECK-NEXT:    [[TMP83:%.*]] = shufflevector <2 x double> [[TMP82]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <2 x double> [[TMP70]], <2 x double> [[TMP83]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP85:%.*]] = getelementptr double, ptr [[A]], i64 34
-; CHECK-NEXT:    [[COL_LOAD113:%.*]] = load <2 x double>, ptr [[TMP85]], align 8
-; CHECK-NEXT:    [[VEC_GEP114:%.*]] = getelementptr double, ptr [[TMP85]], i64 8
-; CHECK-NEXT:    [[COL_LOAD115:%.*]] = load <2 x double>, ptr [[VEC_GEP114]], align 8
-; CHECK-NEXT:    [[TMP86:%.*]] = getelementptr double, ptr [[B]], i64 4
-; CHECK-NEXT:    [[COL_LOAD116:%.*]] = load <2 x double>, ptr [[TMP86]], align 8
-; CHECK-NEXT:    [[VEC_GEP117:%.*]] = getelementptr double, ptr [[TMP86]], i64 8
-; CHECK-NEXT:    [[COL_LOAD118:%.*]] = load <2 x double>, ptr [[VEC_GEP117]], align 8
-; CHECK-NEXT:    [[BLOCK119:%.*]] = shufflevector <2 x double> [[TMP78]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK120:%.*]] = shufflevector <2 x double> [[COL_LOAD113]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <2 x double> [[COL_LOAD116]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT121:%.*]] = insertelement <2 x double> poison, double [[TMP87]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT122:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT121]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP88:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK120]], <2 x double> [[SPLAT_SPLAT122]], <2 x double> [[BLOCK119]])
-; CHECK-NEXT:    [[BLOCK123:%.*]] = shufflevector <2 x double> [[COL_LOAD115]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x double> [[COL_LOAD116]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT124:%.*]] = insertelement <2 x double> poison, double [[TMP89]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT125:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT124]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP90:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK123]], <2 x double> [[SPLAT_SPLAT125]], <2 x double> [[TMP88]])
-; CHECK-NEXT:    [[TMP91:%.*]] = shufflevector <2 x double> [[TMP90]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP92:%.*]] = shufflevector <2 x double> [[TMP78]], <2 x double> [[TMP91]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK126:%.*]] = shufflevector <2 x double> [[TMP84]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK127:%.*]] = shufflevector <2 x double> [[COL_LOAD113]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <2 x double> [[COL_LOAD118]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT128:%.*]] = insertelement <2 x double> poison, double [[TMP93]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT129:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT128]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP94:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK127]], <2 x double> [[SPLAT_SPLAT129]], <2 x double> [[BLOCK126]])
-; CHECK-NEXT:    [[BLOCK130:%.*]] = shufflevector <2 x double> [[COL_LOAD115]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP95:%.*]] = extractelement <2 x double> [[COL_LOAD118]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT131:%.*]] = insertelement <2 x double> poison, double [[TMP95]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT132:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT131]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP96:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK130]], <2 x double> [[SPLAT_SPLAT132]], <2 x double> [[TMP94]])
-; CHECK-NEXT:    [[TMP97:%.*]] = shufflevector <2 x double> [[TMP96]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP98:%.*]] = shufflevector <2 x double> [[TMP84]], <2 x double> [[TMP97]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP99:%.*]] = getelementptr double, ptr [[A]], i64 50
-; CHECK-NEXT:    [[COL_LOAD133:%.*]] = load <2 x double>, ptr [[TMP99]], align 8
-; CHECK-NEXT:    [[VEC_GEP134:%.*]] = getelementptr double, ptr [[TMP99]], i64 8
-; CHECK-NEXT:    [[COL_LOAD135:%.*]] = load <2 x double>, ptr [[VEC_GEP134]], align 8
-; CHECK-NEXT:    [[TMP100:%.*]] = getelementptr double, ptr [[B]], i64 6
-; CHECK-NEXT:    [[COL_LOAD136:%.*]] = load <2 x double>, ptr [[TMP100]], align 8
-; CHECK-NEXT:    [[VEC_GEP137:%.*]] = getelementptr double, ptr [[TMP100]], i64 8
-; CHECK-NEXT:    [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[VEC_GEP137]], align 8
-; CHECK-NEXT:    [[BLOCK139:%.*]] = shufflevector <2 x double> [[TMP92]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK140:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP101:%.*]] = extractelement <2 x double> [[COL_LOAD136]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT141:%.*]] = insertelement <2 x double> poison, double [[TMP101]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT142:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT141]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP102:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK140]], <2 x double> [[SPLAT_SPLAT142]], <2 x double> [[BLOCK139]])
-; CHECK-NEXT:    [[BLOCK143:%.*]] = shufflevector <2 x double> [[COL_LOAD135]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP103:%.*]] = extractelement <2 x double> [[COL_LOAD136]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT144:%.*]] = insertelement <2 x double> poison, double [[TMP103]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT145:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT144]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP104:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK143]], <2 x double> [[SPLAT_SPLAT145]], <2 x double> [[TMP102]])
-; CHECK-NEXT:    [[TMP105:%.*]] = shufflevector <2 x double> [[TMP104]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP106:%.*]] = shufflevector <2 x double> [[TMP92]], <2 x double> [[TMP105]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK146:%.*]] = shufflevector <2 x double> [[TMP98]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK147:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x double> [[COL_LOAD138]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT148:%.*]] = insertelement <2 x double> poison, double [[TMP107]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT149:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT148]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP108:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK147]], <2 x double> [[SPLAT_SPLAT149]], <2 x double> [[BLOCK146]])
-; CHECK-NEXT:    [[BLOCK150:%.*]] = shufflevector <2 x double> [[COL_LOAD135]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP109:%.*]] = extractelement <2 x double> [[COL_LOAD138]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT151:%.*]] = insertelement <2 x double> poison, double [[TMP109]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT152:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT151]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP110:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK150]], <2 x double> [[SPLAT_SPLAT152]], <2 x double> [[TMP108]])
-; CHECK-NEXT:    [[TMP111:%.*]] = shufflevector <2 x double> [[TMP110]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP112:%.*]] = shufflevector <2 x double> [[TMP98]], <2 x double> [[TMP111]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP113:%.*]] = getelementptr double, ptr [[C]], i64 2
-; CHECK-NEXT:    store <2 x double> [[TMP106]], ptr [[TMP113]], align 8
-; CHECK-NEXT:    [[VEC_GEP153:%.*]] = getelementptr double, ptr [[TMP113]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP112]], ptr [[VEC_GEP153]], align 8
-; CHECK-NEXT:    [[TMP114:%.*]] = getelementptr double, ptr [[A]], i64 4
-; CHECK-NEXT:    [[COL_LOAD154:%.*]] = load <2 x double>, ptr [[TMP114]], align 8
-; CHECK-NEXT:    [[VEC_GEP155:%.*]] = getelementptr double, ptr [[TMP114]], i64 8
-; CHECK-NEXT:    [[COL_LOAD156:%.*]] = load <2 x double>, ptr [[VEC_GEP155]], align 8
-; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT:    [[COL_LOAD157:%.*]] = load <2 x double>, ptr [[TMP115]], align 8
-; CHECK-NEXT:    [[VEC_GEP158:%.*]] = getelementptr double, ptr [[TMP115]], i64 8
-; CHECK-NEXT:    [[COL_LOAD159:%.*]] = load <2 x double>, ptr [[VEC_GEP158]], align 8
-; CHECK-NEXT:    [[BLOCK160:%.*]] = shufflevector <2 x double> [[COL_LOAD154]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <2 x double> [[COL_LOAD157]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT161:%.*]] = insertelement <2 x double> poison, double [[TMP116]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT162:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT161]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP117:%.*]] = fmul contract <2 x double> [[BLOCK160]], [[SPLAT_SPLAT162]]
-; CHECK-NEXT:    [[BLOCK163:%.*]] = shufflevector <2 x double> [[COL_LOAD156]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP118:%.*]] = extractelement <2 x double> [[COL_LOAD157]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT164:%.*]] = insertelement <2 x double> poison, double [[TMP118]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT165:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT164]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP119:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK163]], <2 x double> [[SPLAT_SPLAT165]], <2 x double> [[TMP117]])
-; CHECK-NEXT:    [[TMP120:%.*]] = shufflevector <2 x double> [[TMP119]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP121:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP120]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK166:%.*]] = shufflevector <2 x double> [[COL_LOAD154]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP122:%.*]] = extractelement <2 x double> [[COL_LOAD159]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT167:%.*]] = insertelement <2 x double> poison, double [[TMP122]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT168:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT167]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP123:%.*]] = fmul contract <2 x double> [[BLOCK166]], [[SPLAT_SPLAT168]]
-; CHECK-NEXT:    [[BLOCK169:%.*]] = shufflevector <2 x double> [[COL_LOAD156]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <2 x double> [[COL_LOAD159]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT170:%.*]] = insertelement <2 x double> poison, double [[TMP124]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT171:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT170]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP125:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK169]], <2 x double> [[SPLAT_SPLAT171]], <2 x double> [[TMP123]])
-; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x double> [[TMP125]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP127:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP126]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP128:%.*]] = getelementptr double, ptr [[A]], i64 20
-; CHECK-NEXT:    [[COL_LOAD172:%.*]] = load <2 x double>, ptr [[TMP128]], align 8
-; CHECK-NEXT:    [[VEC_GEP173:%.*]] = getelementptr double, ptr [[TMP128]], i64 8
-; CHECK-NEXT:    [[COL_LOAD174:%.*]] = load <2 x double>, ptr [[VEC_GEP173]], align 8
-; CHECK-NEXT:    [[TMP129:%.*]] = getelementptr double, ptr [[B]], i64 2
-; CHECK-NEXT:    [[COL_LOAD175:%.*]] = load <2 x double>, ptr [[TMP129]], align 8
-; CHECK-NEXT:    [[VEC_GEP176:%.*]] = getelementptr double, ptr [[TMP129]], i64 8
-; CHECK-NEXT:    [[COL_LOAD177:%.*]] = load <2 x double>, ptr [[VEC_GEP176]], align 8
-; CHECK-NEXT:    [[BLOCK178:%.*]] = shufflevector <2 x double> [[TMP121]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK179:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP130:%.*]] = extractelement <2 x double> [[COL_LOAD175]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT180:%.*]] = insertelement <2 x double> poison, double [[TMP130]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT181:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT180]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP131:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK179]], <2 x double> [[SPLAT_SPLAT181]], <2 x double> [[BLOCK178]])
-; CHECK-NEXT:    [[BLOCK182:%.*]] = shufflevector <2 x double> [[COL_LOAD174]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <2 x double> [[COL_LOAD175]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT183:%.*]] = insertelement <2 x double> poison, double [[TMP132]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT184:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT183]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP133:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK182]], <2 x double> [[SPLAT_SPLAT184]], <2 x double> [[TMP131]])
-; CHECK-NEXT:    [[TMP134:%.*]] = shufflevector <2 x double> [[TMP133]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP135:%.*]] = shufflevector <2 x double> [[TMP121]], <2 x double> [[TMP134]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK185:%.*]] = shufflevector <2 x double> [[TMP127]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK186:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP136:%.*]] = extractelement <2 x double> [[COL_LOAD177]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT187:%.*]] = insertelement <2 x double> poison, double [[TMP136]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT188:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT187]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP137:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK186]], <2 x double> [[SPLAT_SPLAT188]], <2 x double> [[BLOCK185]])
-; CHECK-NEXT:    [[BLOCK189:%.*]] = shufflevector <2 x double> [[COL_LOAD174]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP138:%.*]] = extractelement <2 x double> [[COL_LOAD177]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT190:%.*]] = insertelement <2 x double> poison, double [[TMP138]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT191:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT190]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP139:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK189]], <2 x double> [[SPLAT_SPLAT191]], <2 x double> [[TMP137]])
-; CHECK-NEXT:    [[TMP140:%.*]] = shufflevector <2 x double> [[TMP139]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP141:%.*]] = shufflevector <2 x double> [[TMP127]], <2 x double> [[TMP140]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP142:%.*]] = getelementptr double, ptr [[A]], i64 36
-; CHECK-NEXT:    [[COL_LOAD192:%.*]] = load <2 x double>, ptr [[TMP142]], align 8
-; CHECK-NEXT:    [[VEC_GEP193:%.*]] = getelementptr double, ptr [[TMP142]], i64 8
-; CHECK-NEXT:    [[COL_LOAD194:%.*]] = load <2 x double>, ptr [[VEC_GEP193]], align 8
-; CHECK-NEXT:    [[TMP143:%.*]] = getelementptr double, ptr [[B]], i64 4
-; CHECK-NEXT:    [[COL_LOAD195:%.*]] = load <2 x double>, ptr [[TMP143]], align 8
-; CHECK-NEXT:    [[VEC_GEP196:%.*]] = getelementptr double, ptr [[TMP143]], i64 8
-; CHECK-NEXT:    [[COL_LOAD197:%.*]] = load <2 x double>, ptr [[VEC_GEP196]], align 8
-; CHECK-NEXT:    [[BLOCK198:%.*]] = shufflevector <2 x double> [[TMP135]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK199:%.*]] = shufflevector <2 x double> [[COL_LOAD192]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP144:%.*]] = extractelement <2 x double> [[COL_LOAD195]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT200:%.*]] = insertelement <2 x double> poison, double [[TMP144]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT201:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT200]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP145:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK199]], <2 x double> [[SPLAT_SPLAT201]], <2 x double> [[BLOCK198]])
-; CHECK-NEXT:    [[BLOCK202:%.*]] = shufflevector <2 x double> [[COL_LOAD194]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x double> [[COL_LOAD195]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT203:%.*]] = insertelement <2 x double> poison, double [[TMP146]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT204:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT203]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP147:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK202]], <2 x double> [[SPLAT_SPLAT204]], <2 x double> [[TMP145]])
-; CHECK-NEXT:    [[TMP148:%.*]] = shufflevector <2 x double> [[TMP147]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP149:%.*]] = shufflevector <2 x double> [[TMP135]], <2 x double> [[TMP148]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK205:%.*]] = shufflevector <2 x double> [[TMP141]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK206:%.*]] = shufflevector <2 x double> [[COL_LOAD192]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP150:%.*]] = extractelement <2 x double> [[COL_LOAD197]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT207:%.*]] = insertelement <2 x double> poison, double [[TMP150]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT208:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT207]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP151:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK206]], <2 x double> [[SPLAT_SPLAT208]], <2 x double> [[BLOCK205]])
-; CHECK-NEXT:    [[BLOCK209:%.*]] = shufflevector <2 x double> [[COL_LOAD194]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP152:%.*]] = extractelement <2 x double> [[COL_LOAD197]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT210:%.*]] = insertelement <2 x double> poison, double [[TMP152]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT211:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT210]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP153:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK209]], <2 x double> [[SPLAT_SPLAT211]], <2 x double> [[TMP151]])
-; CHECK-NEXT:    [[TMP154:%.*]] = shufflevector <2 x double> [[TMP153]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP155:%.*]] = shufflevector <2 x double> [[TMP141]], <2 x double> [[TMP154]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP156:%.*]] = getelementptr double, ptr [[A]], i64 52
-; CHECK-NEXT:    [[COL_LOAD212:%.*]] = load <2 x double>, ptr [[TMP156]], align 8
-; CHECK-NEXT:    [[VEC_GEP213:%.*]] = getelementptr double, ptr [[TMP156]], i64 8
-; CHECK-NEXT:    [[COL_LOAD214:%.*]] = load <2 x double>, ptr [[VEC_GEP213]], align 8
-; CHECK-NEXT:    [[TMP157:%.*]] = getelementptr double, ptr [[B]], i64 6
-; CHECK-NEXT:    [[COL_LOAD215:%.*]] = load <2 x double>, ptr [[TMP157]], align 8
-; CHECK-NEXT:    [[VEC_GEP216:%.*]] = getelementptr double, ptr [[TMP157]], i64 8
-; CHECK-NEXT:    [[COL_LOAD217:%.*]] = load <2 x double>, ptr [[VEC_GEP216]], align 8
-; CHECK-NEXT:    [[BLOCK218:%.*]] = shufflevector <2 x double> [[TMP149]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK219:%.*]] = shufflevector <2 x double> [[COL_LOAD212]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP158:%.*]] = extractelement <2 x double> [[COL_LOAD215]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT220:%.*]] = insertelement <2 x double> poison, double [[TMP158]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT221:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT220]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP159:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK219]], <2 x double> [[SPLAT_SPLAT221]], <2 x double> [[BLOCK218]])
-; CHECK-NEXT:    [[BLOCK222:%.*]] = shufflevector <2 x double> [[COL_LOAD214]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x double> [[COL_LOAD215]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT223:%.*]] = insertelement <2 x double> poison, double [[TMP160]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT224:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT223]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP161:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK222]], <2 x double> [[SPLAT_SPLAT224]], <2 x double> [[TMP159]])
-; CHECK-NEXT:    [[TMP162:%.*]] = shufflevector <2 x double> [[TMP161]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP163:%.*]] = shufflevector <2 x double> [[TMP149]], <2 x double> [[TMP162]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK225:%.*]] = shufflevector <2 x double> [[TMP155]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK226:%.*]] = shufflevector <2 x double> [[COL_LOAD212]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP164:%.*]] = extractelement <2 x double> [[COL_LOAD217]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT227:%.*]] = insertelement <2 x double> poison, double [[TMP164]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT228:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT227]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP165:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK226]], <2 x double> [[SPLAT_SPLAT228]], <2 x double> [[BLOCK225]])
-; CHECK-NEXT:    [[BLOCK229:%.*]] = shufflevector <2 x double> [[COL_LOAD214]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP166:%.*]] = extractelement <2 x double> [[COL_LOAD217]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT230:%.*]] = insertelement <2 x double> poison, double [[TMP166]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT231:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT230]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP167:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK229]], <2 x double> [[SPLAT_SPLAT231]], <2 x double> [[TMP165]])
-; CHECK-NEXT:    [[TMP168:%.*]] = shufflevector <2 x double> [[TMP167]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP169:%.*]] = shufflevector <2 x double> [[TMP155]], <2 x double> [[TMP168]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP170:%.*]] = getelementptr double, ptr [[C]], i64 4
-; CHECK-NEXT:    store <2 x double> [[TMP163]], ptr [[TMP170]], align 8
-; CHECK-NEXT:    [[VEC_GEP232:%.*]] = getelementptr double, ptr [[TMP170]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP169]], ptr [[VEC_GEP232]], align 8
-; CHECK-NEXT:    [[TMP171:%.*]] = getelementptr double, ptr [[A]], i64 6
-; CHECK-NEXT:    [[COL_LOAD233:%.*]] = load <2 x double>, ptr [[TMP171]], align 8
-; CHECK-NEXT:    [[VEC_GEP234:%.*]] = getelementptr double, ptr [[TMP171]], i64 8
-; CHECK-NEXT:    [[COL_LOAD235:%.*]] = load <2 x double>, ptr [[VEC_GEP234]], align 8
-; CHECK-NEXT:    [[TMP172:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT:    [[COL_LOAD236:%.*]] = load <2 x double>, ptr [[TMP172]], align 8
-; CHECK-NEXT:    [[VEC_GEP237:%.*]] = getelementptr double, ptr [[TMP172]], i64 8
-; CHECK-NEXT:    [[COL_LOAD238:%.*]] = load <2 x double>, ptr [[VEC_GEP237]], align 8
-; CHECK-NEXT:    [[BLOCK239:%.*]] = shufflevector <2 x double> [[COL_LOAD233]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP173:%.*]] = extractelement <2 x double> [[COL_LOAD236]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT240:%.*]] = insertelement <2 x double> poison, double [[TMP173]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT241:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT240]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP174:%.*]] = fmul contract <2 x double> [[BLOCK239]], [[SPLAT_SPLAT241]]
-; CHECK-NEXT:    [[BLOCK242:%.*]] = shufflevector <2 x double> [[COL_LOAD235]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP175:%.*]] = extractelement <2 x double> [[COL_LOAD236]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT243:%.*]] = insertelement <2 x double> poison, double [[TMP175]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT244:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT243]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP176:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK242]], <2 x double> [[SPLAT_SPLAT244]], <2 x double> [[TMP174]])
-; CHECK-NEXT:    [[TMP177:%.*]] = shufflevector <2 x double> [[TMP176]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP178:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP177]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK245:%.*]] = shufflevector <2 x double> [[COL_LOAD233]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP179:%.*]] = extractelement <2 x double> [[COL_LOAD238]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT246:%.*]] = insertelement <2 x double> poison, double [[TMP179]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT247:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT246]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP180:%.*]] = fmul contract <2 x double> [[BLOCK245]], [[SPLAT_SPLAT247]]
-; CHECK-NEXT:    [[BLOCK248:%.*]] = shufflevector <2 x double> [[COL_LOAD235]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP181:%.*]] = extractelement <2 x double> [[COL_LOAD238]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT249:%.*]] = insertelement <2 x double> poison, double [[TMP181]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT250:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT249]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP182:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK248]], <2 x double> [[SPLAT_SPLAT250]], <2 x double> [[TMP180]])
-; CHECK-NEXT:    [[TMP183:%.*]] = shufflevector <2 x double> [[TMP182]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP184:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP183]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP185:%.*]] = getelementptr double, ptr [[A]], i64 22
-; CHECK-NEXT:    [[COL_LOAD251:%.*]] = load <2 x double>, ptr [[TMP185]], align 8
-; CHECK-NEXT:    [[VEC_GEP252:%.*]] = getelementptr double, ptr [[TMP185]], i64 8
-; CHECK-NEXT:    [[COL_LOAD253:%.*]] = load <2 x double>, ptr [[VEC_GEP252]], align 8
-; CHECK-NEXT:    [[TMP186:%.*]] = getelementptr double, ptr [[B]], i64 2
-; CHECK-NEXT:    [[COL_LOAD254:%.*]] = load <2 x double>, ptr [[TMP186]], align 8
-; CHECK-NEXT:    [[VEC_GEP255:%.*]] = getelementptr double, ptr [[TMP186]], i64 8
-; CHECK-NEXT:    [[COL_LOAD256:%.*]] = load <2 x double>, ptr [[VEC_GEP255]], align 8
-; CHECK-NEXT:    [[BLOCK257:%.*]] = shufflevector <2 x double> [[TMP178]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK258:%.*]] = shufflevector <2 x double> [[COL_LOAD251]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP187:%.*]] = extractelement <2 x double> [[COL_LOAD254]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT259:%.*]] = insertelement <2 x double> poison, double [[TMP187]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT260:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT259]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP188:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK258]], <2 x double> [[SPLAT_SPLAT260]], <2 x double> [[BLOCK257]])
-; CHECK-NEXT:    [[BLOCK261:%.*]] = shufflevector <2 x double> [[COL_LOAD253]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x double> [[COL_LOAD254]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT262:%.*]] = insertelement <2 x double> poison, double [[TMP189]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT263:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT262]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP190:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK261]], <2 x double> [[SPLAT_SPLAT263]], <2 x double> [[TMP188]])
-; CHECK-NEXT:    [[TMP191:%.*]] = shufflevector <2 x double> [[TMP190]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP192:%.*]] = shufflevector <2 x double> [[TMP178]], <2 x double> [[TMP191]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK264:%.*]] = shufflevector <2 x double> [[TMP184]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK265:%.*]] = shufflevector <2 x double> [[COL_LOAD251]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP193:%.*]] = extractelement <2 x double> [[COL_LOAD256]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT266:%.*]] = insertelement <2 x double> poison, double [[TMP193]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT267:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT266]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP194:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK265]], <2 x double> [[SPLAT_SPLAT267]], <2 x double> [[BLOCK264]])
-; CHECK-NEXT:    [[BLOCK268:%.*]] = shufflevector <2 x double> [[COL_LOAD253]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP195:%.*]] = extractelement <2 x double> [[COL_LOAD256]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT269:%.*]] = insertelement <2 x double> poison, double [[TMP195]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT270:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT269]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP196:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK268]], <2 x double> [[SPLAT_SPLAT270]], <2 x double> [[TMP194]])
-; CHECK-NEXT:    [[TMP197:%.*]] = shufflevector <2 x double> [[TMP196]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x double> [[TMP184]], <2 x double> [[TMP197]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP199:%.*]] = getelementptr double, ptr [[A]], i64 38
-; CHECK-NEXT:    [[COL_LOAD271:%.*]] = load <2 x double>, ptr [[TMP199]], align 8
-; CHECK-NEXT:    [[VEC_GEP272:%.*]] = getelementptr double, ptr [[TMP199]], i64 8
-; CHECK-NEXT:    [[COL_LOAD273:%.*]] = load <2 x double>, ptr [[VEC_GEP272]], align 8
-; CHECK-NEXT:    [[TMP200:%.*]] = getelementptr double, ptr [[B]], i64 4
-; CHECK-NEXT:    [[COL_LOAD274:%.*]] = load <2 x double>, ptr [[TMP200]], align 8
-; CHECK-NEXT:    [[VEC_GEP275:%.*]] = getelementptr double, ptr [[TMP200]], i64 8
-; CHECK-NEXT:    [[COL_LOAD276:%.*]] = load <2 x double>, ptr [[VEC_GEP275]], align 8
-; CHECK-NEXT:    [[BLOCK277:%.*]] = shufflevector <2 x double> [[TMP192]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK278:%.*]] = shufflevector <2 x double> [[COL_LOAD271]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP201:%.*]] = extractelement <2 x double> [[COL_LOAD274]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT279:%.*]] = insertelement <2 x double> poison, double [[TMP201]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT280:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT279]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP202:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK278]], <2 x double> [[SPLAT_SPLAT280]], <2 x double> [[BLOCK277]])
-; CHECK-NEXT:    [[BLOCK281:%.*]] = shufflevector <2 x double> [[COL_LOAD273]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP203:%.*]] = extractelement <2 x double> [[COL_LOAD274]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT282:%.*]] = insertelement <2 x double> poison, double [[TMP203]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT283:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT282]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP204:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK281]], <2 x double> [[SPLAT_SPLAT283]], <2 x double> [[TMP202]])
-; CHECK-NEXT:    [[TMP205:%.*]] = shufflevector <2 x double> [[TMP204]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP206:%.*]] = shufflevector <2 x double> [[TMP192]], <2 x double> [[TMP205]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK284:%.*]] = shufflevector <2 x double> [[TMP198]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK285:%.*]] = shufflevector <2 x double> [[COL_LOAD271]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP207:%.*]] = extractelement <2 x double> [[COL_LOAD276]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT286:%.*]] = insertelement <2 x double> poison, double [[TMP207]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT287:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT286]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP208:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK285]], <2 x double> [[SPLAT_SPLAT287]], <2 x double> [[BLOCK284]])
-; CHECK-NEXT:    [[BLOCK288:%.*]] = shufflevector <2 x double> [[COL_LOAD273]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP209:%.*]] = extractelement <2 x double> [[COL_LOAD276]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT289:%.*]] = insertelement <2 x double> poison, double [[TMP209]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT290:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT289]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP210:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK288]], <2 x double> [[SPLAT_SPLAT290]], <2 x double> [[TMP208]])
-; CHECK-NEXT:    [[TMP211:%.*]] = shufflevector <2 x double> [[TMP210]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP212:%.*]] = shufflevector <2 x double> [[TMP198]], <2 x double> [[TMP211]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP213:%.*]] = getelementptr double, ptr [[A]], i64 54
-; CHECK-NEXT:    [[COL_LOAD291:%.*]] = load <2 x double>, ptr [[TMP213]], align 8
-; CHECK-NEXT:    [[VEC_GEP292:%.*]] = getelementptr double, ptr [[TMP213]], i64 8
-; CHECK-NEXT:    [[COL_LOAD293:%.*]] = load <2 x double>, ptr [[VEC_GEP292]], align 8
-; CHECK-NEXT:    [[TMP214:%.*]] = getelementptr double, ptr [[B]], i64 6
-; CHECK-NEXT:    [[COL_LOAD294:%.*]] = load <2 x double>, ptr [[TMP214]], align 8
-; CHECK-NEXT:    [[VEC_GEP295:%.*]] = getelementptr double, ptr [[TMP214]], i64 8
-; CHECK-NEXT:    [[COL_LOAD296:%.*]] = load <2 x double>, ptr [[VEC_GEP295]], align 8
-; CHECK-NEXT:    [[BLOCK297:%.*]] = shufflevector <2 x double> [[TMP206]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK298:%.*]] = shufflevector <2 x double> [[COL_LOAD291]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP215:%.*]] = extractelement <2 x double> [[COL_LOAD294]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT299:%.*]] = insertelement <2 x double> poison, double [[TMP215]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT300:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT299]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP216:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK298]], <2 x double> [[SPLAT_SPLAT300]], <2 x double> [[BLOCK297]])
-; CHECK-NEXT:    [[BLOCK301:%.*]] = shufflevector <2 x double> [[COL_LOAD293]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP217:%.*]] = extractelement <2 x double> [[COL_LOAD294]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT302:%.*]] = insertelement <2 x double> poison, double [[TMP217]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT303:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT302]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP218:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK301]], <2 x double> [[SPLAT_SPLAT303]], <2 x double> [[TMP216]])
-; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x double> [[TMP218]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP220:%.*]] = shufflevector <2 x double> [[TMP206]], <2 x double> [[TMP219]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK304:%.*]] = shufflevector <2 x double> [[TMP212]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK305:%.*]] = shufflevector <2 x double> [[COL_LOAD291]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP221:%.*]] = extractelement <2 x double> [[COL_LOAD296]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT306:%.*]] = insertelement <2 x double> poison, double [[TMP221]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT307:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT306]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP222:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK305]], <2 x double> [[SPLAT_SPLAT307]], <2 x double> [[BLOCK304]])
-; CHECK-NEXT:    [[BLOCK308:%.*]] = shufflevector <2 x double> [[COL_LOAD293]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP223:%.*]] = extractelement <2 x double> [[COL_LOAD296]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT309:%.*]] = insertelement <2 x double> poison, double [[TMP223]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT310:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT309]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP224:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK308]], <2 x double> [[SPLAT_SPLAT310]], <2 x double> [[TMP222]])
-; CHECK-NEXT:    [[TMP225:%.*]] = shufflevector <2 x double> [[TMP224]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP226:%.*]] = shufflevector <2 x double> [[TMP212]], <2 x double> [[TMP225]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP227:%.*]] = getelementptr double, ptr [[C]], i64 6
-; CHECK-NEXT:    store <2 x double> [[TMP220]], ptr [[TMP227]], align 8
-; CHECK-NEXT:    [[VEC_GEP311:%.*]] = getelementptr double, ptr [[TMP227]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP226]], ptr [[VEC_GEP311]], align 8
-; CHECK-NEXT:    [[TMP228:%.*]] = getelementptr double, ptr [[A]], i64 0
-; CHECK-NEXT:    [[COL_LOAD312:%.*]] = load <2 x double>, ptr [[TMP228]], align 8
-; CHECK-NEXT:    [[VEC_GEP313:%.*]] = getelementptr double, ptr [[TMP228]], i64 8
-; CHECK-NEXT:    [[COL_LOAD314:%.*]] = load <2 x double>, ptr [[VEC_GEP313]], align 8
-; CHECK-NEXT:    [[TMP229:%.*]] = getelementptr double, ptr [[B]], i64 16
-; CHECK-NEXT:    [[COL_LOAD315:%.*]] = load <2 x double>, ptr [[TMP229]], align 8
-; CHECK-NEXT:    [[VEC_GEP316:%.*]] = getelementptr double, ptr [[TMP229]], i64 8
-; CHECK-NEXT:    [[COL_LOAD317:%.*]] = load <2 x double>, ptr [[VEC_GEP316]], align 8
-; CHECK-NEXT:    [[BLOCK318:%.*]] = shufflevector <2 x double> [[COL_LOAD312]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP230:%.*]] = extractelement <2 x double> [[COL_LOAD315]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT319:%.*]] = insertelement <2 x double> poison, double [[TMP230]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT320:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT319]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP231:%.*]] = fmul contract <2 x double> [[BLOCK318]], [[SPLAT_SPLAT320]]
-; CHECK-NEXT:    [[BLOCK321:%.*]] = shufflevector <2 x double> [[COL_LOAD314]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP232:%.*]] = extractelement <2 x double> [[COL_LOAD315]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT322:%.*]] = insertelement <2 x double> poison, double [[TMP232]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT323:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT322]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP233:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK321]], <2 x double> [[SPLAT_SPLAT323]], <2 x double> [[TMP231]])
-; CHECK-NEXT:    [[TMP234:%.*]] = shufflevector <2 x double> [[TMP233]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP235:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP234]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK324:%.*]] = shufflevector <2 x double> [[COL_LOAD312]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x double> [[COL_LOAD317]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT325:%.*]] = insertelement <2 x double> poison, double [[TMP236]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT326:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT325]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP237:%.*]] = fmul contract <2 x double> [[BLOCK324]], [[SPLAT_SPLAT326]]
-; CHECK-NEXT:    [[BLOCK327:%.*]] = shufflevector <2 x double> [[COL_LOAD314]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP238:%.*]] = extractelement <2 x double> [[COL_LOAD317]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT328:%.*]] = insertelement <2 x double> poison, double [[TMP238]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT329:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT328]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP239:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK327]], <2 x double> [[SPLAT_SPLAT329]], <2 x double> [[TMP237]])
-; CHECK-NEXT:    [[TMP240:%.*]] = shufflevector <2 x double> [[TMP239]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP241:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP240]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP242:%.*]] = getelementptr double, ptr [[A]], i64 16
-; CHECK-NEXT:    [[COL_LOAD330:%.*]] = load <2 x double>, ptr [[TMP242]], align 8
-; CHECK-NEXT:    [[VEC_GEP331:%.*]] = getelementptr double, ptr [[TMP242]], i64 8
-; CHECK-NEXT:    [[COL_LOAD332:%.*]] = load <2 x double>, ptr [[VEC_GEP331]], align 8
-; CHECK-NEXT:    [[TMP243:%.*]] = getelementptr double, ptr [[B]], i64 18
-; CHECK-NEXT:    [[COL_LOAD333:%.*]] = load <2 x double>, ptr [[TMP243]], align 8
-; CHECK-NEXT:    [[VEC_GEP334:%.*]] = getelementptr double, ptr [[TMP243]], i64 8
-; CHECK-NEXT:    [[COL_LOAD335:%.*]] = load <2 x double>, ptr [[VEC_GEP334]], align 8
-; CHECK-NEXT:    [[BLOCK336:%.*]] = shufflevector <2 x double> [[TMP235]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK337:%.*]] = shufflevector <2 x double> [[COL_LOAD330]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP244:%.*]] = extractelement <2 x double> [[COL_LOAD333]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT338:%.*]] = insertelement <2 x double> poison, double [[TMP244]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT339:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT338]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP245:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK337]], <2 x double> [[SPLAT_SPLAT339]], <2 x double> [[BLOCK336]])
-; CHECK-NEXT:    [[BLOCK340:%.*]] = shufflevector <2 x double> [[COL_LOAD332]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP246:%.*]] = extractelement <2 x double> [[COL_LOAD333]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT341:%.*]] = insertelement <2 x double> poison, double [[TMP246]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT342:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT341]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP247:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK340]], <2 x double> [[SPLAT_SPLAT342]], <2 x double> [[TMP245]])
-; CHECK-NEXT:    [[TMP248:%.*]] = shufflevector <2 x double> [[TMP247]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP249:%.*]] = shufflevector <2 x double> [[TMP235]], <2 x double> [[TMP248]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK343:%.*]] = shufflevector <2 x double> [[TMP241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK344:%.*]] = shufflevector <2 x double> [[COL_LOAD330]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP250:%.*]] = extractelement <2 x double> [[COL_LOAD335]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT345:%.*]] = insertelement <2 x double> poison, double [[TMP250]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT346:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT345]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP251:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK344]], <2 x double> [[SPLAT_SPLAT346]], <2 x double> [[BLOCK343]])
-; CHECK-NEXT:    [[BLOCK347:%.*]] = shufflevector <2 x double> [[COL_LOAD332]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP252:%.*]] = extractelement <2 x double> [[COL_LOAD335]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT348:%.*]] = insertelement <2 x double> poison, double [[TMP252]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT349:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT348]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP253:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK347]], <2 x double> [[SPLAT_SPLAT349]], <2 x double> [[TMP251]])
-; CHECK-NEXT:    [[TMP254:%.*]] = shufflevector <2 x double> [[TMP253]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP255:%.*]] = shufflevector <2 x double> [[TMP241]], <2 x double> [[TMP254]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP256:%.*]] = getelementptr double, ptr [[A]], i64 32
-; CHECK-NEXT:    [[COL_LOAD350:%.*]] = load <2 x double>, ptr [[TMP256]], align 8
-; CHECK-NEXT:    [[VEC_GEP351:%.*]] = getelementptr double, ptr [[TMP256]], i64 8
-; CHECK-NEXT:    [[COL_LOAD352:%.*]] = load <2 x double>, ptr [[VEC_GEP351]], align 8
-; CHECK-NEXT:    [[TMP257:%.*]] = getelementptr double, ptr [[B]], i64 20
-; CHECK-NEXT:    [[COL_LOAD353:%.*]] = load <2 x double>, ptr [[TMP257]], align 8
-; CHECK-NEXT:    [[VEC_GEP354:%.*]] = getelementptr double, ptr [[TMP257]], i64 8
-; CHECK-NEXT:    [[COL_LOAD355:%.*]] = load <2 x double>, ptr [[VEC_GEP354]], align 8
-; CHECK-NEXT:    [[BLOCK356:%.*]] = shufflevector <2 x double> [[TMP249]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK357:%.*]] = shufflevector <2 x double> [[COL_LOAD350]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP258:%.*]] = extractelement <2 x double> [[COL_LOAD353]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT358:%.*]] = insertelement <2 x double> poison, double [[TMP258]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT359:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT358]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP259:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK357]], <2 x double> [[SPLAT_SPLAT359]], <2 x double> [[BLOCK356]])
-; CHECK-NEXT:    [[BLOCK360:%.*]] = shufflevector <2 x double> [[COL_LOAD352]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP260:%.*]] = extractelement <2 x double> [[COL_LOAD353]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT361:%.*]] = insertelement <2 x double> poison, double [[TMP260]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT362:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT361]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP261:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK360]], <2 x double> [[SPLAT_SPLAT362]], <2 x double> [[TMP259]])
-; CHECK-NEXT:    [[TMP262:%.*]] = shufflevector <2 x double> [[TMP261]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP263:%.*]] = shufflevector <2 x double> [[TMP249]], <2 x double> [[TMP262]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK363:%.*]] = shufflevector <2 x double> [[TMP255]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK364:%.*]] = shufflevector <2 x double> [[COL_LOAD350]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP264:%.*]] = extractelement <2 x double> [[COL_LOAD355]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT365:%.*]] = insertelement <2 x double> poison, double [[TMP264]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT366:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT365]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP265:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK364]], <2 x double> [[SPLAT_SPLAT366]], <2 x double> [[BLOCK363]])
-; CHECK-NEXT:    [[BLOCK367:%.*]] = shufflevector <2 x double> [[COL_LOAD352]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP266:%.*]] = extractelement <2 x double> [[COL_LOAD355]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT368:%.*]] = insertelement <2 x double> poison, double [[TMP266]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT369:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT368]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP267:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK367]], <2 x double> [[SPLAT_SPLAT369]], <2 x double> [[TMP265]])
-; CHECK-NEXT:    [[TMP268:%.*]] = shufflevector <2 x double> [[TMP267]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP269:%.*]] = shufflevector <2 x double> [[TMP255]], <2 x double> [[TMP268]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP270:%.*]] = getelementptr double, ptr [[A]], i64 48
-; CHECK-NEXT:    [[COL_LOAD370:%.*]] = load <2 x double>, ptr [[TMP270]], align 8
-; CHECK-NEXT:    [[VEC_GEP371:%.*]] = getelementptr double, ptr [[TMP270]], i64 8
-; CHECK-NEXT:    [[COL_LOAD372:%.*]] = load <2 x double>, ptr [[VEC_GEP371]], align 8
-; CHECK-NEXT:    [[TMP271:%.*]] = getelementptr double, ptr [[B]], i64 22
-; CHECK-NEXT:    [[COL_LOAD373:%.*]] = load <2 x double>, ptr [[TMP271]], align 8
-; CHECK-NEXT:    [[VEC_GEP374:%.*]] = getelementptr double, ptr [[TMP271]], i64 8
-; CHECK-NEXT:    [[COL_LOAD375:%.*]] = load <2 x double>, ptr [[VEC_GEP374]], align 8
-; CHECK-NEXT:    [[BLOCK376:%.*]] = shufflevector <2 x double> [[TMP263]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK377:%.*]] = shufflevector <2 x double> [[COL_LOAD370]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP272:%.*]] = extractelement <2 x double> [[COL_LOAD373]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT378:%.*]] = insertelement <2 x double> poison, double [[TMP272]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT379:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT378]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP273:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK377]], <2 x double> [[SPLAT_SPLAT379]], <2 x double> [[BLOCK376]])
-; CHECK-NEXT:    [[BLOCK380:%.*]] = shufflevector <2 x double> [[COL_LOAD372]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP274:%.*]] = extractelement <2 x double> [[COL_LOAD373]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT381:%.*]] = insertelement <2 x double> poison, double [[TMP274]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT382:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT381]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP275:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK380]], <2 x double> [[SPLAT_SPLAT382]], <2 x double> [[TMP273]])
-; CHECK-NEXT:    [[TMP276:%.*]] = shufflevector <2 x double> [[TMP275]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP277:%.*]] = shufflevector <2 x double> [[TMP263]], <2 x double> [[TMP276]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK383:%.*]] = shufflevector <2 x double> [[TMP269]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK384:%.*]] = shufflevector <2 x double> [[COL_LOAD370]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP278:%.*]] = extractelement <2 x double> [[COL_LOAD375]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT385:%.*]] = insertelement <2 x double> poison, double [[TMP278]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT386:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT385]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP279:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK384]], <2 x double> [[SPLAT_SPLAT386]], <2 x double> [[BLOCK383]])
-; CHECK-NEXT:    [[BLOCK387:%.*]] = shufflevector <2 x double> [[COL_LOAD372]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP280:%.*]] = extractelement <2 x double> [[COL_LOAD375]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT388:%.*]] = insertelement <2 x double> poison, double [[TMP280]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT389:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT388]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP281:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK387]], <2 x double> [[SPLAT_SPLAT389]], <2 x double> [[TMP279]])
-; CHECK-NEXT:    [[TMP282:%.*]] = shufflevector <2 x double> [[TMP281]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP283:%.*]] = shufflevector <2 x double> [[TMP269]], <2 x double> [[TMP282]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP284:%.*]] = getelementptr double, ptr [[C]], i64 16
-; CHECK-NEXT:    store <2 x double> [[TMP277]], ptr [[TMP284]], align 8
-; CHECK-NEXT:    [[VEC_GEP390:%.*]] = getelementptr double, ptr [[TMP284]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP283]], ptr [[VEC_GEP390]], align 8
-; CHECK-NEXT:    [[TMP285:%.*]] = getelementptr double, ptr [[A]], i64 2
-; CHECK-NEXT:    [[COL_LOAD391:%.*]] = load <2 x double>, ptr [[TMP285]], align 8
-; CHECK-NEXT:    [[VEC_GEP392:%.*]] = getelementptr double, ptr [[TMP285]], i64 8
-; CHECK-NEXT:    [[COL_LOAD393:%.*]] = load <2 x double>, ptr [[VEC_GEP392]], align 8
-; CHECK-NEXT:    [[TMP286:%.*]] = getelementptr double, ptr [[B]], i64 16
-; CHECK-NEXT:    [[COL_LOAD394:%.*]] = load <2 x double>, ptr [[TMP286]], align 8
-; CHECK-NEXT:    [[VEC_GEP395:%.*]] = getelementptr double, ptr [[TMP286]], i64 8
-; CHECK-NEXT:    [[COL_LOAD396:%.*]] = load <2 x double>, ptr [[VEC_GEP395]], align 8
-; CHECK-NEXT:    [[BLOCK397:%.*]] = shufflevector <2 x double> [[COL_LOAD391]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP287:%.*]] = extractelement <2 x double> [[COL_LOAD394]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT398:%.*]] = insertelement <2 x double> poison, double [[TMP287]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT399:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT398]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP288:%.*]] = fmul contract <2 x double> [[BLOCK397]], [[SPLAT_SPLAT399]]
-; CHECK-NEXT:    [[BLOCK400:%.*]] = shufflevector <2 x double> [[COL_LOAD393]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP289:%.*]] = extractelement <2 x double> [[COL_LOAD394]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT401:%.*]] = insertelement <2 x double> poison, double [[TMP289]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT402:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT401]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP290:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK400]], <2 x double> [[SPLAT_SPLAT402]], <2 x double> [[TMP288]])
-; CHECK-NEXT:    [[TMP291:%.*]] = shufflevector <2 x double> [[TMP290]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP292:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP291]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK403:%.*]] = shufflevector <2 x double> [[COL_LOAD391]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP293:%.*]] = extractelement <2 x double> [[COL_LOAD396]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT404:%.*]] = insertelement <2 x double> poison, double [[TMP293]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT405:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT404]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP294:%.*]] = fmul contract <2 x double> [[BLOCK403]], [[SPLAT_SPLAT405]]
-; CHECK-NEXT:    [[BLOCK406:%.*]] = shufflevector <2 x double> [[COL_LOAD393]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP295:%.*]] = extractelement <2 x double> [[COL_LOAD396]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT407:%.*]] = insertelement <2 x double> poison, double [[TMP295]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT408:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT407]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP296:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK406]], <2 x double> [[SPLAT_SPLAT408]], <2 x double> [[TMP294]])
-; CHECK-NEXT:    [[TMP297:%.*]] = shufflevector <2 x double> [[TMP296]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP298:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP297]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP299:%.*]] = getelementptr double, ptr [[A]], i64 18
-; CHECK-NEXT:    [[COL_LOAD409:%.*]] = load <2 x double>, ptr [[TMP299]], align 8
-; CHECK-NEXT:    [[VEC_GEP410:%.*]] = getelementptr double, ptr [[TMP299]], i64 8
-; CHECK-NEXT:    [[COL_LOAD411:%.*]] = load <2 x double>, ptr [[VEC_GEP410]], align 8
-; CHECK-NEXT:    [[TMP300:%.*]] = getelementptr double, ptr [[B]], i64 18
-; CHECK-NEXT:    [[COL_LOAD412:%.*]] = load <2 x double>, ptr [[TMP300]], align 8
-; CHECK-NEXT:    [[VEC_GEP413:%.*]] = getelementptr double, ptr [[TMP300]], i64 8
-; CHECK-NEXT:    [[COL_LOAD414:%.*]] = load <2 x double>, ptr [[VEC_GEP413]], align 8
-; CHECK-NEXT:    [[BLOCK415:%.*]] = shufflevector <2 x double> [[TMP292]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK416:%.*]] = shufflevector <2 x double> [[COL_LOAD409]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP301:%.*]] = extractelement <2 x double> [[COL_LOAD412]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT417:%.*]] = insertelement <2 x double> poison, double [[TMP301]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT418:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT417]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP302:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK416]], <2 x double> [[SPLAT_SPLAT418]], <2 x double> [[BLOCK415]])
-; CHECK-NEXT:    [[BLOCK419:%.*]] = shufflevector <2 x double> [[COL_LOAD411]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP303:%.*]] = extractelement <2 x double> [[COL_LOAD412]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT420:%.*]] = insertelement <2 x double> poison, double [[TMP303]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT421:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT420]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP304:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK419]], <2 x double> [[SPLAT_SPLAT421]], <2 x double> [[TMP302]])
-; CHECK-NEXT:    [[TMP305:%.*]] = shufflevector <2 x double> [[TMP304]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP306:%.*]] = shufflevector <2 x double> [[TMP292]], <2 x double> [[TMP305]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK422:%.*]] = shufflevector <2 x double> [[TMP298]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK423:%.*]] = shufflevector <2 x double> [[COL_LOAD409]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP307:%.*]] = extractelement <2 x double> [[COL_LOAD414]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT424:%.*]] = insertelement <2 x double> poison, double [[TMP307]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT425:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT424]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP308:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK423]], <2 x double> [[SPLAT_SPLAT425]], <2 x double> [[BLOCK422]])
-; CHECK-NEXT:    [[BLOCK426:%.*]] = shufflevector <2 x double> [[COL_LOAD411]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP309:%.*]] = extractelement <2 x double> [[COL_LOAD414]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT427:%.*]] = insertelement <2 x double> poison, double [[TMP309]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT428:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT427]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP310:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK426]], <2 x double> [[SPLAT_SPLAT428]], <2 x double> [[TMP308]])
-; CHECK-NEXT:    [[TMP311:%.*]] = shufflevector <2 x double> [[TMP310]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP312:%.*]] = shufflevector <2 x double> [[TMP298]], <2 x double> [[TMP311]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP313:%.*]] = getelementptr double, ptr [[A]], i64 34
-; CHECK-NEXT:    [[COL_LOAD429:%.*]] = load <2 x double>, ptr [[TMP313]], align 8
-; CHECK-NEXT:    [[VEC_GEP430:%.*]] = getelementptr double, ptr [[TMP313]], i64 8
-; CHECK-NEXT:    [[COL_LOAD431:%.*]] = load <2 x double>, ptr [[VEC_GEP430]], align 8
-; CHECK-NEXT:    [[TMP314:%.*]] = getelementptr double, ptr [[B]], i64 20
-; CHECK-NEXT:    [[COL_LOAD432:%.*]] = load <2 x double>, ptr [[TMP314]], align 8
-; CHECK-NEXT:    [[VEC_GEP433:%.*]] = getelementptr double, ptr [[TMP314]], i64 8
-; CHECK-NEXT:    [[COL_LOAD434:%.*]] = load <2 x double>, ptr [[VEC_GEP433]], align 8
-; CHECK-NEXT:    [[BLOCK435:%.*]] = shufflevector <2 x double> [[TMP306]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK436:%.*]] = shufflevector <2 x double> [[COL_LOAD429]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP315:%.*]] = extractelement <2 x double> [[COL_LOAD432]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT437:%.*]] = insertelement <2 x double> poison, double [[TMP315]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT438:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT437]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP316:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK436]], <2 x double> [[SPLAT_SPLAT438]], <2 x double> [[BLOCK435]])
-; CHECK-NEXT:    [[BLOCK439:%.*]] = shufflevector <2 x double> [[COL_LOAD431]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP317:%.*]] = extractelement <2 x double> [[COL_LOAD432]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT440:%.*]] = insertelement <2 x double> poison, double [[TMP317]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT441:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT440]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP318:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK439]], <2 x double> [[SPLAT_SPLAT441]], <2 x double> [[TMP316]])
-; CHECK-NEXT:    [[TMP319:%.*]] = shufflevector <2 x double> [[TMP318]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP320:%.*]] = shufflevector <2 x double> [[TMP306]], <2 x double> [[TMP319]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK442:%.*]] = shufflevector <2 x double> [[TMP312]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK443:%.*]] = shufflevector <2 x double> [[COL_LOAD429]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP321:%.*]] = extractelement <2 x double> [[COL_LOAD434]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT444:%.*]] = insertelement <2 x double> poison, double [[TMP321]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT445:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT444]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP322:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK443]], <2 x double> [[SPLAT_SPLAT445]], <2 x double> [[BLOCK442]])
-; CHECK-NEXT:    [[BLOCK446:%.*]] = shufflevector <2 x double> [[COL_LOAD431]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP323:%.*]] = extractelement <2 x double> [[COL_LOAD434]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT447:%.*]] = insertelement <2 x double> poison, double [[TMP323]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT448:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT447]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP324:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK446]], <2 x double> [[SPLAT_SPLAT448]], <2 x double> [[TMP322]])
-; CHECK-NEXT:    [[TMP325:%.*]] = shufflevector <2 x double> [[TMP324]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP326:%.*]] = shufflevector <2 x double> [[TMP312]], <2 x double> [[TMP325]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP327:%.*]] = getelementptr double, ptr [[A]], i64 50
-; CHECK-NEXT:    [[COL_LOAD449:%.*]] = load <2 x double>, ptr [[TMP327]], align 8
-; CHECK-NEXT:    [[VEC_GEP450:%.*]] = getelementptr double, ptr [[TMP327]], i64 8
-; CHECK-NEXT:    [[COL_LOAD451:%.*]] = load <2 x double>, ptr [[VEC_GEP450]], align 8
-; CHECK-NEXT:    [[TMP328:%.*]] = getelementptr double, ptr [[B]], i64 22
-; CHECK-NEXT:    [[COL_LOAD452:%.*]] = load <2 x double>, ptr [[TMP328]], align 8
-; CHECK-NEXT:    [[VEC_GEP453:%.*]] = getelementptr double, ptr [[TMP328]], i64 8
-; CHECK-NEXT:    [[COL_LOAD454:%.*]] = load <2 x double>, ptr [[VEC_GEP453]], align 8
-; CHECK-NEXT:    [[BLOCK455:%.*]] = shufflevector <2 x double> [[TMP320]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK456:%.*]] = shufflevector <2 x double> [[COL_LOAD449]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP329:%.*]] = extractelement <2 x double> [[COL_LOAD452]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT457:%.*]] = insertelement <2 x double> poison, double [[TMP329]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT458:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT457]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP330:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK456]], <2 x double> [[SPLAT_SPLAT458]], <2 x double> [[BLOCK455]])
-; CHECK-NEXT:    [[BLOCK459:%.*]] = shufflevector <2 x double> [[COL_LOAD451]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP331:%.*]] = extractelement <2 x double> [[COL_LOAD452]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT460:%.*]] = insertelement <2 x double> poison, double [[TMP331]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT461:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT460]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP332:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK459]], <2 x double> [[SPLAT_SPLAT461]], <2 x double> [[TMP330]])
-; CHECK-NEXT:    [[TMP333:%.*]] = shufflevector <2 x double> [[TMP332]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP334:%.*]] = shufflevector <2 x double> [[TMP320]], <2 x double> [[TMP333]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK462:%.*]] = shufflevector <2 x double> [[TMP326]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK463:%.*]] = shufflevector <2 x double> [[COL_LOAD449]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP335:%.*]] = extractelement <2 x double> [[COL_LOAD454]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT464:%.*]] = insertelement <2 x double> poison, double [[TMP335]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT465:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT464]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP336:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK463]], <2 x double> [[SPLAT_SPLAT465]], <2 x double> [[BLOCK462]])
-; CHECK-NEXT:    [[BLOCK466:%.*]] = shufflevector <2 x double> [[COL_LOAD451]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP337:%.*]] = extractelement <2 x double> [[COL_LOAD454]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT467:%.*]] = insertelement <2 x double> poison, double [[TMP337]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT468:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT467]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP338:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK466]], <2 x double> [[SPLAT_SPLAT468]], <2 x double> [[TMP336]])
-; CHECK-NEXT:    [[TMP339:%.*]] = shufflevector <2 x double> [[TMP338]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP340:%.*]] = shufflevector <2 x double> [[TMP326]], <2 x double> [[TMP339]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP341:%.*]] = getelementptr double, ptr [[C]], i64 18
-; CHECK-NEXT:    store <2 x double> [[TMP334]], ptr [[TMP341]], align 8
-; CHECK-NEXT:    [[VEC_GEP469:%.*]] = getelementptr double, ptr [[TMP341]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP340]], ptr [[VEC_GEP469]], align 8
-; CHECK-NEXT:    [[TMP342:%.*]] = getelementptr double, ptr [[A]], i64 4
-; CHECK-NEXT:    [[COL_LOAD470:%.*]] = load <2 x double>, ptr [[TMP342]], align 8
-; CHECK-NEXT:    [[VEC_GEP471:%.*]] = getelementptr double, ptr [[TMP342]], i64 8
-; CHECK-NEXT:    [[COL_LOAD472:%.*]] = load <2 x double>, ptr [[VEC_GEP471]], align 8
-; CHECK-NEXT:    [[TMP343:%.*]] = getelementptr double, ptr [[B]], i64 16
-; CHECK-NEXT:    [[COL_LOAD473:%.*]] = load <2 x double>, ptr [[TMP343]], align 8
-; CHECK-NEXT:    [[VEC_GEP474:%.*]] = getelementptr double, ptr [[TMP343]], i64 8
-; CHECK-NEXT:    [[COL_LOAD475:%.*]] = load <2 x double>, ptr [[VEC_GEP474]], align 8
-; CHECK-NEXT:    [[BLOCK476:%.*]] = shufflevector <2 x double> [[COL_LOAD470]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP344:%.*]] = extractelement <2 x double> [[COL_LOAD473]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT477:%.*]] = insertelement <2 x double> poison, double [[TMP344]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT478:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT477]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP345:%.*]] = fmul contract <2 x double> [[BLOCK476]], [[SPLAT_SPLAT478]]
-; CHECK-NEXT:    [[BLOCK479:%.*]] = shufflevector <2 x double> [[COL_LOAD472]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP346:%.*]] = extractelement <2 x double> [[COL_LOAD473]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT480:%.*]] = insertelement <2 x double> poison, double [[TMP346]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT481:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT480]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP347:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK479]], <2 x double> [[SPLAT_SPLAT481]], <2 x double> [[TMP345]])
-; CHECK-NEXT:    [[TMP348:%.*]] = shufflevector <2 x double> [[TMP347]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP349:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP348]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK482:%.*]] = shufflevector <2 x double> [[COL_LOAD470]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP350:%.*]] = extractelement <2 x double> [[COL_LOAD475]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT483:%.*]] = insertelement <2 x double> poison, double [[TMP350]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT484:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT483]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP351:%.*]] = fmul contract <2 x double> [[BLOCK482]], [[SPLAT_SPLAT484]]
-; CHECK-NEXT:    [[BLOCK485:%.*]] = shufflevector <2 x double> [[COL_LOAD472]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP352:%.*]] = extractelement <2 x double> [[COL_LOAD475]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT486:%.*]] = insertelement <2 x double> poison, double [[TMP352]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT487:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT486]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP353:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK485]], <2 x double> [[SPLAT_SPLAT487]], <2 x double> [[TMP351]])
-; CHECK-NEXT:    [[TMP354:%.*]] = shufflevector <2 x double> [[TMP353]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP355:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP354]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP356:%.*]] = getelementptr double, ptr [[A]], i64 20
-; CHECK-NEXT:    [[COL_LOAD488:%.*]] = load <2 x double>, ptr [[TMP356]], align 8
-; CHECK-NEXT:    [[VEC_GEP489:%.*]] = getelementptr double, ptr [[TMP356]], i64 8
-; CHECK-NEXT:    [[COL_LOAD490:%.*]] = load <2 x double>, ptr [[VEC_GEP489]], align 8
-; CHECK-NEXT:    [[TMP357:%.*]] = getelementptr double, ptr [[B]], i64 18
-; CHECK-NEXT:    [[COL_LOAD491:%.*]] = load <2 x double>, ptr [[TMP357]], align 8
-; CHECK-NEXT:    [[VEC_GEP492:%.*]] = getelementptr double, ptr [[TMP357]], i64 8
-; CHECK-NEXT:    [[COL_LOAD493:%.*]] = load <2 x double>, ptr [[VEC_GEP492]], align 8
-; CHECK-NEXT:    [[BLOCK494:%.*]] = shufflevector <2 x double> [[TMP349]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK495:%.*]] = shufflevector <2 x double> [[COL_LOAD488]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP358:%.*]] = extractelement <2 x double> [[COL_LOAD491]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT496:%.*]] = insertelement <2 x double> poison, double [[TMP358]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT497:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT496]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP359:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK495]], <2 x double> [[SPLAT_SPLAT497]], <2 x double> [[BLOCK494]])
-; CHECK-NEXT:    [[BLOCK498:%.*]] = shufflevector <2 x double> [[COL_LOAD490]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP360:%.*]] = extractelement <2 x double> [[COL_LOAD491]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT499:%.*]] = insertelement <2 x double> poison, double [[TMP360]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT500:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT499]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP361:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK498]], <2 x double> [[SPLAT_SPLAT500]], <2 x double> [[TMP359]])
-; CHECK-NEXT:    [[TMP362:%.*]] = shufflevector <2 x double> [[TMP361]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP363:%.*]] = shufflevector <2 x double> [[TMP349]], <2 x double> [[TMP362]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK501:%.*]] = shufflevector <2 x double> [[TMP355]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK502:%.*]] = shufflevector <2 x double> [[COL_LOAD488]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP364:%.*]] = extractelement <2 x double> [[COL_LOAD493]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT503:%.*]] = insertelement <2 x double> poison, double [[TMP364]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT504:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT503]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP365:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK502]], <2 x double> [[SPLAT_SPLAT504]], <2 x double> [[BLOCK501]])
-; CHECK-NEXT:    [[BLOCK505:%.*]] = shufflevector <2 x double> [[COL_LOAD490]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP366:%.*]] = extractelement <2 x double> [[COL_LOAD493]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT506:%.*]] = insertelement <2 x double> poison, double [[TMP366]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT507:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT506]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP367:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK505]], <2 x double> [[SPLAT_SPLAT507]], <2 x double> [[TMP365]])
-; CHECK-NEXT:    [[TMP368:%.*]] = shufflevector <2 x double> [[TMP367]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP369:%.*]] = shufflevector <2 x double> [[TMP355]], <2 x double> [[TMP368]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP370:%.*]] = getelementptr double, ptr [[A]], i64 36
-; CHECK-NEXT:    [[COL_LOAD508:%.*]] = load <2 x double>, ptr [[TMP370]], align 8
-; CHECK-NEXT:    [[VEC_GEP509:%.*]] = getelementptr double, ptr [[TMP370]], i64 8
-; CHECK-NEXT:    [[COL_LOAD510:%.*]] = load <2 x double>, ptr [[VEC_GEP509]], align 8
-; CHECK-NEXT:    [[TMP371:%.*]] = getelementptr double, ptr [[B]], i64 20
-; CHECK-NEXT:    [[COL_LOAD511:%.*]] = load <2 x double>, ptr [[TMP371]], align 8
-; CHECK-NEXT:    [[VEC_GEP512:%.*]] = getelementptr double, ptr [[TMP371]], i64 8
-; CHECK-NEXT:    [[COL_LOAD513:%.*]] = load <2 x double>, ptr [[VEC_GEP512]], align 8
-; CHECK-NEXT:    [[BLOCK514:%.*]] = shufflevector <2 x double> [[TMP363]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK515:%.*]] = shufflevector <2 x double> [[COL_LOAD508]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP372:%.*]] = extractelement <2 x double> [[COL_LOAD511]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT516:%.*]] = insertelement <2 x double> poison, double [[TMP372]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT517:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT516]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP373:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK515]], <2 x double> [[SPLAT_SPLAT517]], <2 x double> [[BLOCK514]])
-; CHECK-NEXT:    [[BLOCK518:%.*]] = shufflevector <2 x double> [[COL_LOAD510]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP374:%.*]] = extractelement <2 x double> [[COL_LOAD511]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT519:%.*]] = insertelement <2 x double> poison, double [[TMP374]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT520:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT519]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP375:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK518]], <2 x double> [[SPLAT_SPLAT520]], <2 x double> [[TMP373]])
-; CHECK-NEXT:    [[TMP376:%.*]] = shufflevector <2 x double> [[TMP375]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP377:%.*]] = shufflevector <2 x double> [[TMP363]], <2 x double> [[TMP376]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK521:%.*]] = shufflevector <2 x double> [[TMP369]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK522:%.*]] = shufflevector <2 x double> [[COL_LOAD508]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP378:%.*]] = extractelement <2 x double> [[COL_LOAD513]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT523:%.*]] = insertelement <2 x double> poison, double [[TMP378]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT524:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT523]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP379:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK522]], <2 x double> [[SPLAT_SPLAT524]], <2 x double> [[BLOCK521]])
-; CHECK-NEXT:    [[BLOCK525:%.*]] = shufflevector <2 x double> [[COL_LOAD510]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP380:%.*]] = extractelement <2 x double> [[COL_LOAD513]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT526:%.*]] = insertelement <2 x double> poison, double [[TMP380]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT527:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT526]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP381:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK525]], <2 x double> [[SPLAT_SPLAT527]], <2 x double> [[TMP379]])
-; CHECK-NEXT:    [[TMP382:%.*]] = shufflevector <2 x double> [[TMP381]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP383:%.*]] = shufflevector <2 x double> [[TMP369]], <2 x double> [[TMP382]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP384:%.*]] = getelementptr double, ptr [[A]], i64 52
-; CHECK-NEXT:    [[COL_LOAD528:%.*]] = load <2 x double>, ptr [[TMP384]], align 8
-; CHECK-NEXT:    [[VEC_GEP529:%.*]] = getelementptr double, ptr [[TMP384]], i64 8
-; CHECK-NEXT:    [[COL_LOAD530:%.*]] = load <2 x double>, ptr [[VEC_GEP529]], align 8
-; CHECK-NEXT:    [[TMP385:%.*]] = getelementptr double, ptr [[B]], i64 22
-; CHECK-NEXT:    [[COL_LOAD531:%.*]] = load <2 x double>, ptr [[TMP385]], align 8
-; CHECK-NEXT:    [[VEC_GEP532:%.*]] = getelementptr double, ptr [[TMP385]], i64 8
-; CHECK-NEXT:    [[COL_LOAD533:%.*]] = load <2 x double>, ptr [[VEC_GEP532]], align 8
-; CHECK-NEXT:    [[BLOCK534:%.*]] = shufflevector <2 x double> [[TMP377]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK535:%.*]] = shufflevector <2 x double> [[COL_LOAD528]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP386:%.*]] = extractelement <2 x double> [[COL_LOAD531]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT536:%.*]] = insertelement <2 x double> poison, double [[TMP386]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT537:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT536]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP387:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK535]], <2 x double> [[SPLAT_SPLAT537]], <2 x double> [[BLOCK534]])
-; CHECK-NEXT:    [[BLOCK538:%.*]] = shufflevector <2 x double> [[COL_LOAD530]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP388:%.*]] = extractelement <2 x double> [[COL_LOAD531]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT539:%.*]] = insertelement <2 x double> poison, double [[TMP388]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT540:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT539]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP389:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK538]], <2 x double> [[SPLAT_SPLAT540]], <2 x double> [[TMP387]])
-; CHECK-NEXT:    [[TMP390:%.*]] = shufflevector <2 x double> [[TMP389]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP391:%.*]] = shufflevector <2 x double> [[TMP377]], <2 x double> [[TMP390]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK541:%.*]] = shufflevector <2 x double> [[TMP383]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK542:%.*]] = shufflevector <2 x double> [[COL_LOAD528]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP392:%.*]] = extractelement <2 x double> [[COL_LOAD533]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT543:%.*]] = insertelement <2 x double> poison, double [[TMP392]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT544:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT543]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP393:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK542]], <2 x double> [[SPLAT_SPLAT544]], <2 x double> [[BLOCK541]])
-; CHECK-NEXT:    [[BLOCK545:%.*]] = shufflevector <2 x double> [[COL_LOAD530]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP394:%.*]] = extractelement <2 x double> [[COL_LOAD533]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT546:%.*]] = insertelement <2 x double> poison, double [[TMP394]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT547:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT546]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP395:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK545]], <2 x double> [[SPLAT_SPLAT547]], <2 x double> [[TMP393]])
-; CHECK-NEXT:    [[TMP396:%.*]] = shufflevector <2 x double> [[TMP395]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP397:%.*]] = shufflevector <2 x double> [[TMP383]], <2 x double> [[TMP396]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP398:%.*]] = getelementptr double, ptr [[C]], i64 20
-; CHECK-NEXT:    store <2 x double> [[TMP391]], ptr [[TMP398]], align 8
-; CHECK-NEXT:    [[VEC_GEP548:%.*]] = getelementptr double, ptr [[TMP398]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP397]], ptr [[VEC_GEP548]], align 8
-; CHECK-NEXT:    [[TMP399:%.*]] = getelementptr double, ptr [[A]], i64 6
-; CHECK-NEXT:    [[COL_LOAD549:%.*]] = load <2 x double>, ptr [[TMP399]], align 8
-; CHECK-NEXT:    [[VEC_GEP550:%.*]] = getelementptr double, ptr [[TMP399]], i64 8
-; CHECK-NEXT:    [[COL_LOAD551:%.*]] = load <2 x double>, ptr [[VEC_GEP550]], align 8
-; CHECK-NEXT:    [[TMP400:%.*]] = getelementptr double, ptr [[B]], i64 16
-; CHECK-NEXT:    [[COL_LOAD552:%.*]] = load <2 x double>, ptr [[TMP400]], align 8
-; CHECK-NEXT:    [[VEC_GEP553:%.*]] = getelementptr double, ptr [[TMP400]], i64 8
-; CHECK-NEXT:    [[COL_LOAD554:%.*]] = load <2 x double>, ptr [[VEC_GEP553]], align 8
-; CHECK-NEXT:    [[BLOCK555:%.*]] = shufflevector <2 x double> [[COL_LOAD549]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP401:%.*]] = extractelement <2 x double> [[COL_LOAD552]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT556:%.*]] = insertelement <2 x double> poison, double [[TMP401]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT557:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT556]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP402:%.*]] = fmul contract <2 x double> [[BLOCK555]], [[SPLAT_SPLAT557]]
-; CHECK-NEXT:    [[BLOCK558:%.*]] = shufflevector <2 x double> [[COL_LOAD551]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP403:%.*]] = extractelement <2 x double> [[COL_LOAD552]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT559:%.*]] = insertelement <2 x double> poison, double [[TMP403]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT560:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT559]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP404:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK558]], <2 x double> [[SPLAT_SPLAT560]], <2 x double> [[TMP402]])
-; CHECK-NEXT:    [[TMP405:%.*]] = shufflevector <2 x double> [[TMP404]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP406:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP405]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK561:%.*]] = shufflevector <2 x double> [[COL_LOAD549]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP407:%.*]] = extractelement <2 x double> [[COL_LOAD554]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT562:%.*]] = insertelement <2 x double> poison, double [[TMP407]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT563:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT562]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP408:%.*]] = fmul contract <2 x double> [[BLOCK561]], [[SPLAT_SPLAT563]]
-; CHECK-NEXT:    [[BLOCK564:%.*]] = shufflevector <2 x double> [[COL_LOAD551]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP409:%.*]] = extractelement <2 x double> [[COL_LOAD554]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT565:%.*]] = insertelement <2 x double> poison, double [[TMP409]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT566:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT565]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP410:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK564]], <2 x double> [[SPLAT_SPLAT566]], <2 x double> [[TMP408]])
-; CHECK-NEXT:    [[TMP411:%.*]] = shufflevector <2 x double> [[TMP410]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP412:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP411]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP413:%.*]] = getelementptr double, ptr [[A]], i64 22
-; CHECK-NEXT:    [[COL_LOAD567:%.*]] = load <2 x double>, ptr [[TMP413]], align 8
-; CHECK-NEXT:    [[VEC_GEP568:%.*]] = getelementptr double, ptr [[TMP413]], i64 8
-; CHECK-NEXT:    [[COL_LOAD569:%.*]] = load <2 x double>, ptr [[VEC_GEP568]], align 8
-; CHECK-NEXT:    [[TMP414:%.*]] = getelementptr double, ptr [[B]], i64 18
-; CHECK-NEXT:    [[COL_LOAD570:%.*]] = load <2 x double>, ptr [[TMP414]], align 8
-; CHECK-NEXT:    [[VEC_GEP571:%.*]] = getelementptr double, ptr [[TMP414]], i64 8
-; CHECK-NEXT:    [[COL_LOAD572:%.*]] = load <2 x double>, ptr [[VEC_GEP571]], align 8
-; CHECK-NEXT:    [[BLOCK573:%.*]] = shufflevector <2 x double> [[TMP406]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK574:%.*]] = shufflevector <2 x double> [[COL_LOAD567]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP415:%.*]] = extractelement <2 x double> [[COL_LOAD570]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT575:%.*]] = insertelement <2 x double> poison, double [[TMP415]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT576:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT575]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP416:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK574]], <2 x double> [[SPLAT_SPLAT576]], <2 x double> [[BLOCK573]])
-; CHECK-NEXT:    [[BLOCK577:%.*]] = shufflevector <2 x double> [[COL_LOAD569]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP417:%.*]] = extractelement <2 x double> [[COL_LOAD570]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT578:%.*]] = insertelement <2 x double> poison, double [[TMP417]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT579:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT578]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP418:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK577]], <2 x double> [[SPLAT_SPLAT579]], <2 x double> [[TMP416]])
-; CHECK-NEXT:    [[TMP419:%.*]] = shufflevector <2 x double> [[TMP418]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP420:%.*]] = shufflevector <2 x double> [[TMP406]], <2 x double> [[TMP419]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK580:%.*]] = shufflevector <2 x double> [[TMP412]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK581:%.*]] = shufflevector <2 x double> [[COL_LOAD567]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP421:%.*]] = extractelement <2 x double> [[COL_LOAD572]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT582:%.*]] = insertelement <2 x double> poison, double [[TMP421]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT583:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT582]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP422:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK581]], <2 x double> [[SPLAT_SPLAT583]], <2 x double> [[BLOCK580]])
-; CHECK-NEXT:    [[BLOCK584:%.*]] = shufflevector <2 x double> [[COL_LOAD569]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP423:%.*]] = extractelement <2 x double> [[COL_LOAD572]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT585:%.*]] = insertelement <2 x double> poison, double [[TMP423]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT586:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT585]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP424:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK584]], <2 x double> [[SPLAT_SPLAT586]], <2 x double> [[TMP422]])
-; CHECK-NEXT:    [[TMP425:%.*]] = shufflevector <2 x double> [[TMP424]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP426:%.*]] = shufflevector <2 x double> [[TMP412]], <2 x double> [[TMP425]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP427:%.*]] = getelementptr double, ptr [[A]], i64 38
-; CHECK-NEXT:    [[COL_LOAD587:%.*]] = load <2 x double>, ptr [[TMP427]], align 8
-; CHECK-NEXT:    [[VEC_GEP588:%.*]] = getelementptr double, ptr [[TMP427]], i64 8
-; CHECK-NEXT:    [[COL_LOAD589:%.*]] = load <2 x double>, ptr [[VEC_GEP588]], align 8
-; CHECK-NEXT:    [[TMP428:%.*]] = getelementptr double, ptr [[B]], i64 20
-; CHECK-NEXT:    [[COL_LOAD590:%.*]] = load <2 x double>, ptr [[TMP428]], align 8
-; CHECK-NEXT:    [[VEC_GEP591:%.*]] = getelementptr double, ptr [[TMP428]], i64 8
-; CHECK-NEXT:    [[COL_LOAD592:%.*]] = load <2 x double>, ptr [[VEC_GEP591]], align 8
-; CHECK-NEXT:    [[BLOCK593:%.*]] = shufflevector <2 x double> [[TMP420]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK594:%.*]] = shufflevector <2 x double> [[COL_LOAD587]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP429:%.*]] = extractelement <2 x double> [[COL_LOAD590]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT595:%.*]] = insertelement <2 x double> poison, double [[TMP429]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT596:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT595]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP430:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK594]], <2 x double> [[SPLAT_SPLAT596]], <2 x double> [[BLOCK593]])
-; CHECK-NEXT:    [[BLOCK597:%.*]] = shufflevector <2 x double> [[COL_LOAD589]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP431:%.*]] = extractelement <2 x double> [[COL_LOAD590]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT598:%.*]] = insertelement <2 x double> poison, double [[TMP431]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT599:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT598]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP432:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK597]], <2 x double> [[SPLAT_SPLAT599]], <2 x double> [[TMP430]])
-; CHECK-NEXT:    [[TMP433:%.*]] = shufflevector <2 x double> [[TMP432]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP434:%.*]] = shufflevector <2 x double> [[TMP420]], <2 x double> [[TMP433]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK600:%.*]] = shufflevector <2 x double> [[TMP426]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK601:%.*]] = shufflevector <2 x double> [[COL_LOAD587]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP435:%.*]] = extractelement <2 x double> [[COL_LOAD592]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT602:%.*]] = insertelement <2 x double> poison, double [[TMP435]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT603:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT602]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP436:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK601]], <2 x double> [[SPLAT_SPLAT603]], <2 x double> [[BLOCK600]])
-; CHECK-NEXT:    [[BLOCK604:%.*]] = shufflevector <2 x double> [[COL_LOAD589]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP437:%.*]] = extractelement <2 x double> [[COL_LOAD592]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT605:%.*]] = insertelement <2 x double> poison, double [[TMP437]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT606:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT605]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP438:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK604]], <2 x double> [[SPLAT_SPLAT606]], <2 x double> [[TMP436]])
-; CHECK-NEXT:    [[TMP439:%.*]] = shufflevector <2 x double> [[TMP438]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP440:%.*]] = shufflevector <2 x double> [[TMP426]], <2 x double> [[TMP439]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP441:%.*]] = getelementptr double, ptr [[A]], i64 54
-; CHECK-NEXT:    [[COL_LOAD607:%.*]] = load <2 x double>, ptr [[TMP441]], align 8
-; CHECK-NEXT:    [[VEC_GEP608:%.*]] = getelementptr double, ptr [[TMP441]], i64 8
-; CHECK-NEXT:    [[COL_LOAD609:%.*]] = load <2 x double>, ptr [[VEC_GEP608]], align 8
-; CHECK-NEXT:    [[TMP442:%.*]] = getelementptr double, ptr [[B]], i64 22
-; CHECK-NEXT:    [[COL_LOAD610:%.*]] = load <2 x double>, ptr [[TMP442]], align 8
-; CHECK-NEXT:    [[VEC_GEP611:%.*]] = getelementptr double, ptr [[TMP442]], i64 8
-; CHECK-NEXT:    [[COL_LOAD612:%.*]] = load <2 x double>, ptr [[VEC_GEP611]], align 8
-; CHECK-NEXT:    [[BLOCK613:%.*]] = shufflevector <2 x double> [[TMP434]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK614:%.*]] = shufflevector <2 x double> [[COL_LOAD607]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP443:%.*]] = extractelement <2 x double> [[COL_LOAD610]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT615:%.*]] = insertelement <2 x double> poison, double [[TMP443]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT616:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT615]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP444:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK614]], <2 x double> [[SPLAT_SPLAT616]], <2 x double> [[BLOCK613]])
-; CHECK-NEXT:    [[BLOCK617:%.*]] = shufflevector <2 x double> [[COL_LOAD609]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP445:%.*]] = extractelement <2 x double> [[COL_LOAD610]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT618:%.*]] = insertelement <2 x double> poison, double [[TMP445]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT619:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT618]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP446:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK617]], <2 x double> [[SPLAT_SPLAT619]], <2 x double> [[TMP444]])
-; CHECK-NEXT:    [[TMP447:%.*]] = shufflevector <2 x double> [[TMP446]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP448:%.*]] = shufflevector <2 x double> [[TMP434]], <2 x double> [[TMP447]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK620:%.*]] = shufflevector <2 x double> [[TMP440]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK621:%.*]] = shufflevector <2 x double> [[COL_LOAD607]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP449:%.*]] = extractelement <2 x double> [[COL_LOAD612]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT622:%.*]] = insertelement <2 x double> poison, double [[TMP449]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT623:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT622]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP450:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK621]], <2 x double> [[SPLAT_SPLAT623]], <2 x double> [[BLOCK620]])
-; CHECK-NEXT:    [[BLOCK624:%.*]] = shufflevector <2 x double> [[COL_LOAD609]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP451:%.*]] = extractelement <2 x double> [[COL_LOAD612]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT625:%.*]] = insertelement <2 x double> poison, double [[TMP451]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT626:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT625]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP452:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK624]], <2 x double> [[SPLAT_SPLAT626]], <2 x double> [[TMP450]])
-; CHECK-NEXT:    [[TMP453:%.*]] = shufflevector <2 x double> [[TMP452]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP454:%.*]] = shufflevector <2 x double> [[TMP440]], <2 x double> [[TMP453]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP455:%.*]] = getelementptr double, ptr [[C]], i64 22
-; CHECK-NEXT:    store <2 x double> [[TMP448]], ptr [[TMP455]], align 8
-; CHECK-NEXT:    [[VEC_GEP627:%.*]] = getelementptr double, ptr [[TMP455]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP454]], ptr [[VEC_GEP627]], align 8
-; CHECK-NEXT:    [[TMP456:%.*]] = getelementptr double, ptr [[A]], i64 0
-; CHECK-NEXT:    [[COL_LOAD628:%.*]] = load <2 x double>, ptr [[TMP456]], align 8
-; CHECK-NEXT:    [[VEC_GEP629:%.*]] = getelementptr double, ptr [[TMP456]], i64 8
-; CHECK-NEXT:    [[COL_LOAD630:%.*]] = load <2 x double>, ptr [[VEC_GEP629]], align 8
-; CHECK-NEXT:    [[TMP457:%.*]] = getelementptr double, ptr [[B]], i64 32
-; CHECK-NEXT:    [[COL_LOAD631:%.*]] = load <2 x double>, ptr [[TMP457]], align 8
-; CHECK-NEXT:    [[VEC_GEP632:%.*]] = getelementptr double, ptr [[TMP457]], i64 8
-; CHECK-NEXT:    [[COL_LOAD633:%.*]] = load <2 x double>, ptr [[VEC_GEP632]], align 8
-; CHECK-NEXT:    [[BLOCK634:%.*]] = shufflevector <2 x double> [[COL_LOAD628]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP458:%.*]] = extractelement <2 x double> [[COL_LOAD631]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT635:%.*]] = insertelement <2 x double> poison, double [[TMP458]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT636:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT635]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP459:%.*]] = fmul contract <2 x double> [[BLOCK634]], [[SPLAT_SPLAT636]]
-; CHECK-NEXT:    [[BLOCK637:%.*]] = shufflevector <2 x double> [[COL_LOAD630]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP460:%.*]] = extractelement <2 x double> [[COL_LOAD631]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT638:%.*]] = insertelement <2 x double> poison, double [[TMP460]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT639:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT638]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP461:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK637]], <2 x double> [[SPLAT_SPLAT639]], <2 x double> [[TMP459]])
-; CHECK-NEXT:    [[TMP462:%.*]] = shufflevector <2 x double> [[TMP461]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP463:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP462]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK640:%.*]] = shufflevector <2 x double> [[COL_LOAD628]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP464:%.*]] = extractelement <2 x double> [[COL_LOAD633]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT641:%.*]] = insertelement <2 x double> poison, double [[TMP464]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT642:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT641]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP465:%.*]] = fmul contract <2 x double> [[BLOCK640]], [[SPLAT_SPLAT642]]
-; CHECK-NEXT:    [[BLOCK643:%.*]] = shufflevector <2 x double> [[COL_LOAD630]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP466:%.*]] = extractelement <2 x double> [[COL_LOAD633]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT644:%.*]] = insertelement <2 x double> poison, double [[TMP466]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT645:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT644]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP467:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK643]], <2 x double> [[SPLAT_SPLAT645]], <2 x double> [[TMP465]])
-; CHECK-NEXT:    [[TMP468:%.*]] = shufflevector <2 x double> [[TMP467]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP469:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP468]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP470:%.*]] = getelementptr double, ptr [[A]], i64 16
-; CHECK-NEXT:    [[COL_LOAD646:%.*]] = load <2 x double>, ptr [[TMP470]], align 8
-; CHECK-NEXT:    [[VEC_GEP647:%.*]] = getelementptr double, ptr [[TMP470]], i64 8
-; CHECK-NEXT:    [[COL_LOAD648:%.*]] = load <2 x double>, ptr [[VEC_GEP647]], align 8
-; CHECK-NEXT:    [[TMP471:%.*]] = getelementptr double, ptr [[B]], i64 34
-; CHECK-NEXT:    [[COL_LOAD649:%.*]] = load <2 x double>, ptr [[TMP471]], align 8
-; CHECK-NEXT:    [[VEC_GEP650:%.*]] = getelementptr double, ptr [[TMP471]], i64 8
-; CHECK-NEXT:    [[COL_LOAD651:%.*]] = load <2 x double>, ptr [[VEC_GEP650]], align 8
-; CHECK-NEXT:    [[BLOCK652:%.*]] = shufflevector <2 x double> [[TMP463]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK653:%.*]] = shufflevector <2 x double> [[COL_LOAD646]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP472:%.*]] = extractelement <2 x double> [[COL_LOAD649]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT654:%.*]] = insertelement <2 x double> poison, double [[TMP472]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT655:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT654]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP473:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK653]], <2 x double> [[SPLAT_SPLAT655]], <2 x double> [[BLOCK652]])
-; CHECK-NEXT:    [[BLOCK656:%.*]] = shufflevector <2 x double> [[COL_LOAD648]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP474:%.*]] = extractelement <2 x double> [[COL_LOAD649]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT657:%.*]] = insertelement <2 x double> poison, double [[TMP474]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT658:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT657]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP475:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK656]], <2 x double> [[SPLAT_SPLAT658]], <2 x double> [[TMP473]])
-; CHECK-NEXT:    [[TMP476:%.*]] = shufflevector <2 x double> [[TMP475]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP477:%.*]] = shufflevector <2 x double> [[TMP463]], <2 x double> [[TMP476]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK659:%.*]] = shufflevector <2 x double> [[TMP469]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK660:%.*]] = shufflevector <2 x double> [[COL_LOAD646]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP478:%.*]] = extractelement <2 x double> [[COL_LOAD651]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT661:%.*]] = insertelement <2 x double> poison, double [[TMP478]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT662:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT661]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP479:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK660]], <2 x double> [[SPLAT_SPLAT662]], <2 x double> [[BLOCK659]])
-; CHECK-NEXT:    [[BLOCK663:%.*]] = shufflevector <2 x double> [[COL_LOAD648]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP480:%.*]] = extractelement <2 x double> [[COL_LOAD651]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT664:%.*]] = insertelement <2 x double> poison, double [[TMP480]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT665:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT664]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP481:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK663]], <2 x double> [[SPLAT_SPLAT665]], <2 x double> [[TMP479]])
-; CHECK-NEXT:    [[TMP482:%.*]] = shufflevector <2 x double> [[TMP481]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP483:%.*]] = shufflevector <2 x double> [[TMP469]], <2 x double> [[TMP482]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP484:%.*]] = getelementptr double, ptr [[A]], i64 32
-; CHECK-NEXT:    [[COL_LOAD666:%.*]] = load <2 x double>, ptr [[TMP484]], align 8
-; CHECK-NEXT:    [[VEC_GEP667:%.*]] = getelementptr double, ptr [[TMP484]], i64 8
-; CHECK-NEXT:    [[COL_LOAD668:%.*]] = load <2 x double>, ptr [[VEC_GEP667]], align 8
-; CHECK-NEXT:    [[TMP485:%.*]] = getelementptr double, ptr [[B]], i64 36
-; CHECK-NEXT:    [[COL_LOAD669:%.*]] = load <2 x double>, ptr [[TMP485]], align 8
-; CHECK-NEXT:    [[VEC_GEP670:%.*]] = getelementptr double, ptr [[TMP485]], i64 8
-; CHECK-NEXT:    [[COL_LOAD671:%.*]] = load <2 x double>, ptr [[VEC_GEP670]], align 8
-; CHECK-NEXT:    [[BLOCK672:%.*]] = shufflevector <2 x double> [[TMP477]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK673:%.*]] = shufflevector <2 x double> [[COL_LOAD666]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP486:%.*]] = extractelement <2 x double> [[COL_LOAD669]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT674:%.*]] = insertelement <2 x double> poison, double [[TMP486]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT675:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT674]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP487:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK673]], <2 x double> [[SPLAT_SPLAT675]], <2 x double> [[BLOCK672]])
-; CHECK-NEXT:    [[BLOCK676:%.*]] = shufflevector <2 x double> [[COL_LOAD668]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP488:%.*]] = extractelement <2 x double> [[COL_LOAD669]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT677:%.*]] = insertelement <2 x double> poison, double [[TMP488]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT678:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT677]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP489:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK676]], <2 x double> [[SPLAT_SPLAT678]], <2 x double> [[TMP487]])
-; CHECK-NEXT:    [[TMP490:%.*]] = shufflevector <2 x double> [[TMP489]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP491:%.*]] = shufflevector <2 x double> [[TMP477]], <2 x double> [[TMP490]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK679:%.*]] = shufflevector <2 x double> [[TMP483]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK680:%.*]] = shufflevector <2 x double> [[COL_LOAD666]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP492:%.*]] = extractelement <2 x double> [[COL_LOAD671]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT681:%.*]] = insertelement <2 x double> poison, double [[TMP492]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT682:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT681]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP493:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK680]], <2 x double> [[SPLAT_SPLAT682]], <2 x double> [[BLOCK679]])
-; CHECK-NEXT:    [[BLOCK683:%.*]] = shufflevector <2 x double> [[COL_LOAD668]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP494:%.*]] = extractelement <2 x double> [[COL_LOAD671]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT684:%.*]] = insertelement <2 x double> poison, double [[TMP494]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT685:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT684]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP495:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK683]], <2 x double> [[SPLAT_SPLAT685]], <2 x double> [[TMP493]])
-; CHECK-NEXT:    [[TMP496:%.*]] = shufflevector <2 x double> [[TMP495]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP497:%.*]] = shufflevector <2 x double> [[TMP483]], <2 x double> [[TMP496]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP498:%.*]] = getelementptr double, ptr [[A]], i64 48
-; CHECK-NEXT:    [[COL_LOAD686:%.*]] = load <2 x double>, ptr [[TMP498]], align 8
-; CHECK-NEXT:    [[VEC_GEP687:%.*]] = getelementptr double, ptr [[TMP498]], i64 8
-; CHECK-NEXT:    [[COL_LOAD688:%.*]] = load <2 x double>, ptr [[VEC_GEP687]], align 8
-; CHECK-NEXT:    [[TMP499:%.*]] = getelementptr double, ptr [[B]], i64 38
-; CHECK-NEXT:    [[COL_LOAD689:%.*]] = load <2 x double>, ptr [[TMP499]], align 8
-; CHECK-NEXT:    [[VEC_GEP690:%.*]] = getelementptr double, ptr [[TMP499]], i64 8
-; CHECK-NEXT:    [[COL_LOAD691:%.*]] = load <2 x double>, ptr [[VEC_GEP690]], align 8
-; CHECK-NEXT:    [[BLOCK692:%.*]] = shufflevector <2 x double> [[TMP491]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK693:%.*]] = shufflevector <2 x double> [[COL_LOAD686]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP500:%.*]] = extractelement <2 x double> [[COL_LOAD689]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT694:%.*]] = insertelement <2 x double> poison, double [[TMP500]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT695:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT694]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP501:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK693]], <2 x double> [[SPLAT_SPLAT695]], <2 x double> [[BLOCK692]])
-; CHECK-NEXT:    [[BLOCK696:%.*]] = shufflevector <2 x double> [[COL_LOAD688]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP502:%.*]] = extractelement <2 x double> [[COL_LOAD689]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT697:%.*]] = insertelement <2 x double> poison, double [[TMP502]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT698:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT697]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP503:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK696]], <2 x double> [[SPLAT_SPLAT698]], <2 x double> [[TMP501]])
-; CHECK-NEXT:    [[TMP504:%.*]] = shufflevector <2 x double> [[TMP503]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP505:%.*]] = shufflevector <2 x double> [[TMP491]], <2 x double> [[TMP504]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK699:%.*]] = shufflevector <2 x double> [[TMP497]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK700:%.*]] = shufflevector <2 x double> [[COL_LOAD686]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP506:%.*]] = extractelement <2 x double> [[COL_LOAD691]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT701:%.*]] = insertelement <2 x double> poison, double [[TMP506]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT702:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT701]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP507:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK700]], <2 x double> [[SPLAT_SPLAT702]], <2 x double> [[BLOCK699]])
-; CHECK-NEXT:    [[BLOCK703:%.*]] = shufflevector <2 x double> [[COL_LOAD688]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP508:%.*]] = extractelement <2 x double> [[COL_LOAD691]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT704:%.*]] = insertelement <2 x double> poison, double [[TMP508]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT705:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT704]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP509:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK703]], <2 x double> [[SPLAT_SPLAT705]], <2 x double> [[TMP507]])
-; CHECK-NEXT:    [[TMP510:%.*]] = shufflevector <2 x double> [[TMP509]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP511:%.*]] = shufflevector <2 x double> [[TMP497]], <2 x double> [[TMP510]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP512:%.*]] = getelementptr double, ptr [[C]], i64 32
-; CHECK-NEXT:    store <2 x double> [[TMP505]], ptr [[TMP512]], align 8
-; CHECK-NEXT:    [[VEC_GEP706:%.*]] = getelementptr double, ptr [[TMP512]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP511]], ptr [[VEC_GEP706]], align 8
-; CHECK-NEXT:    [[TMP513:%.*]] = getelementptr double, ptr [[A]], i64 2
-; CHECK-NEXT:    [[COL_LOAD707:%.*]] = load <2 x double>, ptr [[TMP513]], align 8
-; CHECK-NEXT:    [[VEC_GEP708:%.*]] = getelementptr double, ptr [[TMP513]], i64 8
-; CHECK-NEXT:    [[COL_LOAD709:%.*]] = load <2 x double>, ptr [[VEC_GEP708]], align 8
-; CHECK-NEXT:    [[TMP514:%.*]] = getelementptr double, ptr [[B]], i64 32
-; CHECK-NEXT:    [[COL_LOAD710:%.*]] = load <2 x double>, ptr [[TMP514]], align 8
-; CHECK-NEXT:    [[VEC_GEP711:%.*]] = getelementptr double, ptr [[TMP514]], i64 8
-; CHECK-NEXT:    [[COL_LOAD712:%.*]] = load <2 x double>, ptr [[VEC_GEP711]], align 8
-; CHECK-NEXT:    [[BLOCK713:%.*]] = shufflevector <2 x double> [[COL_LOAD707]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP515:%.*]] = extractelement <2 x double> [[COL_LOAD710]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT714:%.*]] = insertelement <2 x double> poison, double [[TMP515]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT715:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT714]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP516:%.*]] = fmul contract <2 x double> [[BLOCK713]], [[SPLAT_SPLAT715]]
-; CHECK-NEXT:    [[BLOCK716:%.*]] = shufflevector <2 x double> [[COL_LOAD709]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP517:%.*]] = extractelement <2 x double> [[COL_LOAD710]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT717:%.*]] = insertelement <2 x double> poison, double [[TMP517]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT718:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT717]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP518:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK716]], <2 x double> [[SPLAT_SPLAT718]], <2 x double> [[TMP516]])
-; CHECK-NEXT:    [[TMP519:%.*]] = shufflevector <2 x double> [[TMP518]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP520:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP519]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK719:%.*]] = shufflevector <2 x double> [[COL_LOAD707]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP521:%.*]] = extractelement <2 x double> [[COL_LOAD712]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT720:%.*]] = insertelement <2 x double> poison, double [[TMP521]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT721:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT720]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP522:%.*]] = fmul contract <2 x double> [[BLOCK719]], [[SPLAT_SPLAT721]]
-; CHECK-NEXT:    [[BLOCK722:%.*]] = shufflevector <2 x double> [[COL_LOAD709]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP523:%.*]] = extractelement <2 x double> [[COL_LOAD712]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT723:%.*]] = insertelement <2 x double> poison, double [[TMP523]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT724:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT723]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP524:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK722]], <2 x double> [[SPLAT_SPLAT724]], <2 x double> [[TMP522]])
-; CHECK-NEXT:    [[TMP525:%.*]] = shufflevector <2 x double> [[TMP524]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP526:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP525]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP527:%.*]] = getelementptr double, ptr [[A]], i64 18
-; CHECK-NEXT:    [[COL_LOAD725:%.*]] = load <2 x double>, ptr [[TMP527]], align 8
-; CHECK-NEXT:    [[VEC_GEP726:%.*]] = getelementptr double, ptr [[TMP527]], i64 8
-; CHECK-NEXT:    [[COL_LOAD727:%.*]] = load <2 x double>, ptr [[VEC_GEP726]], align 8
-; CHECK-NEXT:    [[TMP528:%.*]] = getelementptr double, ptr [[B]], i64 34
-; CHECK-NEXT:    [[COL_LOAD728:%.*]] = load <2 x double>, ptr [[TMP528]], align 8
-; CHECK-NEXT:    [[VEC_GEP729:%.*]] = getelementptr double, ptr [[TMP528]], i64 8
-; CHECK-NEXT:    [[COL_LOAD730:%.*]] = load <2 x double>, ptr [[VEC_GEP729]], align 8
-; CHECK-NEXT:    [[BLOCK731:%.*]] = shufflevector <2 x double> [[TMP520]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK732:%.*]] = shufflevector <2 x double> [[COL_LOAD725]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP529:%.*]] = extractelement <2 x double> [[COL_LOAD728]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT733:%.*]] = insertelement <2 x double> poison, double [[TMP529]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT734:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT733]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP530:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK732]], <2 x double> [[SPLAT_SPLAT734]], <2 x double> [[BLOCK731]])
-; CHECK-NEXT:    [[BLOCK735:%.*]] = shufflevector <2 x double> [[COL_LOAD727]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP531:%.*]] = extractelement <2 x double> [[COL_LOAD728]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT736:%.*]] = insertelement <2 x double> poison, double [[TMP531]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT737:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT736]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP532:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK735]], <2 x double> [[SPLAT_SPLAT737]], <2 x double> [[TMP530]])
-; CHECK-NEXT:    [[TMP533:%.*]] = shufflevector <2 x double> [[TMP532]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP534:%.*]] = shufflevector <2 x double> [[TMP520]], <2 x double> [[TMP533]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK738:%.*]] = shufflevector <2 x double> [[TMP526]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK739:%.*]] = shufflevector <2 x double> [[COL_LOAD725]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP535:%.*]] = extractelement <2 x double> [[COL_LOAD730]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT740:%.*]] = insertelement <2 x double> poison, double [[TMP535]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT741:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT740]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP536:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK739]], <2 x double> [[SPLAT_SPLAT741]], <2 x double> [[BLOCK738]])
-; CHECK-NEXT:    [[BLOCK742:%.*]] = shufflevector <2 x double> [[COL_LOAD727]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP537:%.*]] = extractelement <2 x double> [[COL_LOAD730]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT743:%.*]] = insertelement <2 x double> poison, double [[TMP537]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT744:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT743]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP538:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK742]], <2 x double> [[SPLAT_SPLAT744]], <2 x double> [[TMP536]])
-; CHECK-NEXT:    [[TMP539:%.*]] = shufflevector <2 x double> [[TMP538]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP540:%.*]] = shufflevector <2 x double> [[TMP526]], <2 x double> [[TMP539]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP541:%.*]] = getelementptr double, ptr [[A]], i64 34
-; CHECK-NEXT:    [[COL_LOAD745:%.*]] = load <2 x double>, ptr [[TMP541]], align 8
-; CHECK-NEXT:    [[VEC_GEP746:%.*]] = getelementptr double, ptr [[TMP541]], i64 8
-; CHECK-NEXT:    [[COL_LOAD747:%.*]] = load <2 x double>, ptr [[VEC_GEP746]], align 8
-; CHECK-NEXT:    [[TMP542:%.*]] = getelementptr double, ptr [[B]], i64 36
-; CHECK-NEXT:    [[COL_LOAD748:%.*]] = load <2 x double>, ptr [[TMP542]], align 8
-; CHECK-NEXT:    [[VEC_GEP749:%.*]] = getelementptr double, ptr [[TMP542]], i64 8
-; CHECK-NEXT:    [[COL_LOAD750:%.*]] = load <2 x double>, ptr [[VEC_GEP749]], align 8
-; CHECK-NEXT:    [[BLOCK751:%.*]] = shufflevector <2 x double> [[TMP534]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK752:%.*]] = shufflevector <2 x double> [[COL_LOAD745]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP543:%.*]] = extractelement <2 x double> [[COL_LOAD748]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT753:%.*]] = insertelement <2 x double> poison, double [[TMP543]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT754:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT753]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP544:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK752]], <2 x double> [[SPLAT_SPLAT754]], <2 x double> [[BLOCK751]])
-; CHECK-NEXT:    [[BLOCK755:%.*]] = shufflevector <2 x double> [[COL_LOAD747]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP545:%.*]] = extractelement <2 x double> [[COL_LOAD748]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT756:%.*]] = insertelement <2 x double> poison, double [[TMP545]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT757:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT756]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP546:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK755]], <2 x double> [[SPLAT_SPLAT757]], <2 x double> [[TMP544]])
-; CHECK-NEXT:    [[TMP547:%.*]] = shufflevector <2 x double> [[TMP546]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP548:%.*]] = shufflevector <2 x double> [[TMP534]], <2 x double> [[TMP547]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK758:%.*]] = shufflevector <2 x double> [[TMP540]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK759:%.*]] = shufflevector <2 x double> [[COL_LOAD745]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP549:%.*]] = extractelement <2 x double> [[COL_LOAD750]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT760:%.*]] = insertelement <2 x double> poison, double [[TMP549]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT761:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT760]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP550:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK759]], <2 x double> [[SPLAT_SPLAT761]], <2 x double> [[BLOCK758]])
-; CHECK-NEXT:    [[BLOCK762:%.*]] = shufflevector <2 x double> [[COL_LOAD747]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP551:%.*]] = extractelement <2 x double> [[COL_LOAD750]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT763:%.*]] = insertelement <2 x double> poison, double [[TMP551]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT764:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT763]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP552:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK762]], <2 x double> [[SPLAT_SPLAT764]], <2 x double> [[TMP550]])
-; CHECK-NEXT:    [[TMP553:%.*]] = shufflevector <2 x double> [[TMP552]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP554:%.*]] = shufflevector <2 x double> [[TMP540]], <2 x double> [[TMP553]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP555:%.*]] = getelementptr double, ptr [[A]], i64 50
-; CHECK-NEXT:    [[COL_LOAD765:%.*]] = load <2 x double>, ptr [[TMP555]], align 8
-; CHECK-NEXT:    [[VEC_GEP766:%.*]] = getelementptr double, ptr [[TMP555]], i64 8
-; CHECK-NEXT:    [[COL_LOAD767:%.*]] = load <2 x double>, ptr [[VEC_GEP766]], align 8
-; CHECK-NEXT:    [[TMP556:%.*]] = getelementptr double, ptr [[B]], i64 38
-; CHECK-NEXT:    [[COL_LOAD768:%.*]] = load <2 x double>, ptr [[TMP556]], align 8
-; CHECK-NEXT:    [[VEC_GEP769:%.*]] = getelementptr double, ptr [[TMP556]], i64 8
-; CHECK-NEXT:    [[COL_LOAD770:%.*]] = load <2 x double>, ptr [[VEC_GEP769]], align 8
-; CHECK-NEXT:    [[BLOCK771:%.*]] = shufflevector <2 x double> [[TMP548]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK772:%.*]] = shufflevector <2 x double> [[COL_LOAD765]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP557:%.*]] = extractelement <2 x double> [[COL_LOAD768]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT773:%.*]] = insertelement <2 x double> poison, double [[TMP557]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT774:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT773]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP558:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK772]], <2 x double> [[SPLAT_SPLAT774]], <2 x double> [[BLOCK771]])
-; CHECK-NEXT:    [[BLOCK775:%.*]] = shufflevector <2 x double> [[COL_LOAD767]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP559:%.*]] = extractelement <2 x double> [[COL_LOAD768]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT776:%.*]] = insertelement <2 x double> poison, double [[TMP559]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT777:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT776]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP560:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK775]], <2 x double> [[SPLAT_SPLAT777]], <2 x double> [[TMP558]])
-; CHECK-NEXT:    [[TMP561:%.*]] = shufflevector <2 x double> [[TMP560]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP562:%.*]] = shufflevector <2 x double> [[TMP548]], <2 x double> [[TMP561]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK778:%.*]] = shufflevector <2 x double> [[TMP554]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK779:%.*]] = shufflevector <2 x double> [[COL_LOAD765]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP563:%.*]] = extractelement <2 x double> [[COL_LOAD770]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT780:%.*]] = insertelement <2 x double> poison, double [[TMP563]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT781:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT780]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP564:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK779]], <2 x double> [[SPLAT_SPLAT781]], <2 x double> [[BLOCK778]])
-; CHECK-NEXT:    [[BLOCK782:%.*]] = shufflevector <2 x double> [[COL_LOAD767]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP565:%.*]] = extractelement <2 x double> [[COL_LOAD770]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT783:%.*]] = insertelement <2 x double> poison, double [[TMP565]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT784:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT783]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP566:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK782]], <2 x double> [[SPLAT_SPLAT784]], <2 x double> [[TMP564]])
-; CHECK-NEXT:    [[TMP567:%.*]] = shufflevector <2 x double> [[TMP566]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP568:%.*]] = shufflevector <2 x double> [[TMP554]], <2 x double> [[TMP567]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP569:%.*]] = getelementptr double, ptr [[C]], i64 34
-; CHECK-NEXT:    store <2 x double> [[TMP562]], ptr [[TMP569]], align 8
-; CHECK-NEXT:    [[VEC_GEP785:%.*]] = getelementptr double, ptr [[TMP569]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP568]], ptr [[VEC_GEP785]], align 8
-; CHECK-NEXT:    [[TMP570:%.*]] = getelementptr double, ptr [[A]], i64 4
-; CHECK-NEXT:    [[COL_LOAD786:%.*]] = load <2 x double>, ptr [[TMP570]], align 8
-; CHECK-NEXT:    [[VEC_GEP787:%.*]] = getelementptr double, ptr [[TMP570]], i64 8
-; CHECK-NEXT:    [[COL_LOAD788:%.*]] = load <2 x double>, ptr [[VEC_GEP787]], align 8
-; CHECK-NEXT:    [[TMP571:%.*]] = getelementptr double, ptr [[B]], i64 32
-; CHECK-NEXT:    [[COL_LOAD789:%.*]] = load <2 x double>, ptr [[TMP571]], align 8
-; CHECK-NEXT:    [[VEC_GEP790:%.*]] = getelementptr double, ptr [[TMP571]], i64 8
-; CHECK-NEXT:    [[COL_LOAD791:%.*]] = load <2 x double>, ptr [[VEC_GEP790]], align 8
-; CHECK-NEXT:    [[BLOCK792:%.*]] = shufflevector <2 x double> [[COL_LOAD786]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP572:%.*]] = extractelement <2 x double> [[COL_LOAD789]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT793:%.*]] = insertelement <2 x double> poison, double [[TMP572]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT794:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT793]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP573:%.*]] = fmul contract <2 x double> [[BLOCK792]], [[SPLAT_SPLAT794]]
-; CHECK-NEXT:    [[BLOCK795:%.*]] = shufflevector <2 x double> [[COL_LOAD788]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP574:%.*]] = extractelement <2 x double> [[COL_LOAD789]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT796:%.*]] = insertelement <2 x double> poison, double [[TMP574]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT797:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT796]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP575:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK795]], <2 x double> [[SPLAT_SPLAT797]], <2 x double> [[TMP573]])
-; CHECK-NEXT:    [[TMP576:%.*]] = shufflevector <2 x double> [[TMP575]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP577:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP576]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK798:%.*]] = shufflevector <2 x double> [[COL_LOAD786]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP578:%.*]] = extractelement <2 x double> [[COL_LOAD791]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT799:%.*]] = insertelement <2 x double> poison, double [[TMP578]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT800:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT799]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP579:%.*]] = fmul contract <2 x double> [[BLOCK798]], [[SPLAT_SPLAT800]]
-; CHECK-NEXT:    [[BLOCK801:%.*]] = shufflevector <2 x double> [[COL_LOAD788]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP580:%.*]] = extractelement <2 x double> [[COL_LOAD791]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT802:%.*]] = insertelement <2 x double> poison, double [[TMP580]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT803:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT802]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP581:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK801]], <2 x double> [[SPLAT_SPLAT803]], <2 x double> [[TMP579]])
-; CHECK-NEXT:    [[TMP582:%.*]] = shufflevector <2 x double> [[TMP581]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP583:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP582]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP584:%.*]] = getelementptr double, ptr [[A]], i64 20
-; CHECK-NEXT:    [[COL_LOAD804:%.*]] = load <2 x double>, ptr [[TMP584]], align 8
-; CHECK-NEXT:    [[VEC_GEP805:%.*]] = getelementptr double, ptr [[TMP584]], i64 8
-; CHECK-NEXT:    [[COL_LOAD806:%.*]] = load <2 x double>, ptr [[VEC_GEP805]], align 8
-; CHECK-NEXT:    [[TMP585:%.*]] = getelementptr double, ptr [[B]], i64 34
-; CHECK-NEXT:    [[COL_LOAD807:%.*]] = load <2 x double>, ptr [[TMP585]], align 8
-; CHECK-NEXT:    [[VEC_GEP808:%.*]] = getelementptr double, ptr [[TMP585]], i64 8
-; CHECK-NEXT:    [[COL_LOAD809:%.*]] = load <2 x double>, ptr [[VEC_GEP808]], align 8
-; CHECK-NEXT:    [[BLOCK810:%.*]] = shufflevector <2 x double> [[TMP577]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK811:%.*]] = shufflevector <2 x double> [[COL_LOAD804]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP586:%.*]] = extractelement <2 x double> [[COL_LOAD807]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT812:%.*]] = insertelement <2 x double> poison, double [[TMP586]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT813:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT812]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP587:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK811]], <2 x double> [[SPLAT_SPLAT813]], <2 x double> [[BLOCK810]])
-; CHECK-NEXT:    [[BLOCK814:%.*]] = shufflevector <2 x double> [[COL_LOAD806]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP588:%.*]] = extractelement <2 x double> [[COL_LOAD807]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT815:%.*]] = insertelement <2 x double> poison, double [[TMP588]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT816:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT815]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP589:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK814]], <2 x double> [[SPLAT_SPLAT816]], <2 x double> [[TMP587]])
-; CHECK-NEXT:    [[TMP590:%.*]] = shufflevector <2 x double> [[TMP589]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP591:%.*]] = shufflevector <2 x double> [[TMP577]], <2 x double> [[TMP590]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK817:%.*]] = shufflevector <2 x double> [[TMP583]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK818:%.*]] = shufflevector <2 x double> [[COL_LOAD804]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP592:%.*]] = extractelement <2 x double> [[COL_LOAD809]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT819:%.*]] = insertelement <2 x double> poison, double [[TMP592]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT820:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT819]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP593:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK818]], <2 x double> [[SPLAT_SPLAT820]], <2 x double> [[BLOCK817]])
-; CHECK-NEXT:    [[BLOCK821:%.*]] = shufflevector <2 x double> [[COL_LOAD806]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP594:%.*]] = extractelement <2 x double> [[COL_LOAD809]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT822:%.*]] = insertelement <2 x double> poison, double [[TMP594]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT823:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT822]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP595:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK821]], <2 x double> [[SPLAT_SPLAT823]], <2 x double> [[TMP593]])
-; CHECK-NEXT:    [[TMP596:%.*]] = shufflevector <2 x double> [[TMP595]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP597:%.*]] = shufflevector <2 x double> [[TMP583]], <2 x double> [[TMP596]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP598:%.*]] = getelementptr double, ptr [[A]], i64 36
-; CHECK-NEXT:    [[COL_LOAD824:%.*]] = load <2 x double>, ptr [[TMP598]], align 8
-; CHECK-NEXT:    [[VEC_GEP825:%.*]] = getelementptr double, ptr [[TMP598]], i64 8
-; CHECK-NEXT:    [[COL_LOAD826:%.*]] = load <2 x double>, ptr [[VEC_GEP825]], align 8
-; CHECK-NEXT:    [[TMP599:%.*]] = getelementptr double, ptr [[B]], i64 36
-; CHECK-NEXT:    [[COL_LOAD827:%.*]] = load <2 x double>, ptr [[TMP599]], align 8
-; CHECK-NEXT:    [[VEC_GEP828:%.*]] = getelementptr double, ptr [[TMP599]], i64 8
-; CHECK-NEXT:    [[COL_LOAD829:%.*]] = load <2 x double>, ptr [[VEC_GEP828]], align 8
-; CHECK-NEXT:    [[BLOCK830:%.*]] = shufflevector <2 x double> [[TMP591]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK831:%.*]] = shufflevector <2 x double> [[COL_LOAD824]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP600:%.*]] = extractelement <2 x double> [[COL_LOAD827]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT832:%.*]] = insertelement <2 x double> poison, double [[TMP600]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT833:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT832]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP601:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK831]], <2 x double> [[SPLAT_SPLAT833]], <2 x double> [[BLOCK830]])
-; CHECK-NEXT:    [[BLOCK834:%.*]] = shufflevector <2 x double> [[COL_LOAD826]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP602:%.*]] = extractelement <2 x double> [[COL_LOAD827]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT835:%.*]] = insertelement <2 x double> poison, double [[TMP602]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT836:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT835]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP603:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK834]], <2 x double> [[SPLAT_SPLAT836]], <2 x double> [[TMP601]])
-; CHECK-NEXT:    [[TMP604:%.*]] = shufflevector <2 x double> [[TMP603]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP605:%.*]] = shufflevector <2 x double> [[TMP591]], <2 x double> [[TMP604]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK837:%.*]] = shufflevector <2 x double> [[TMP597]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK838:%.*]] = shufflevector <2 x double> [[COL_LOAD824]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP606:%.*]] = extractelement <2 x double> [[COL_LOAD829]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT839:%.*]] = insertelement <2 x double> poison, double [[TMP606]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT840:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT839]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP607:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK838]], <2 x double> [[SPLAT_SPLAT840]], <2 x double> [[BLOCK837]])
-; CHECK-NEXT:    [[BLOCK841:%.*]] = shufflevector <2 x double> [[COL_LOAD826]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP608:%.*]] = extractelement <2 x double> [[COL_LOAD829]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT842:%.*]] = insertelement <2 x double> poison, double [[TMP608]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT843:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT842]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP609:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK841]], <2 x double> [[SPLAT_SPLAT843]], <2 x double> [[TMP607]])
-; CHECK-NEXT:    [[TMP610:%.*]] = shufflevector <2 x double> [[TMP609]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP611:%.*]] = shufflevector <2 x double> [[TMP597]], <2 x double> [[TMP610]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP612:%.*]] = getelementptr double, ptr [[A]], i64 52
-; CHECK-NEXT:    [[COL_LOAD844:%.*]] = load <2 x double>, ptr [[TMP612]], align 8
-; CHECK-NEXT:    [[VEC_GEP845:%.*]] = getelementptr double, ptr [[TMP612]], i64 8
-; CHECK-NEXT:    [[COL_LOAD846:%.*]] = load <2 x double>, ptr [[VEC_GEP845]], align 8
-; CHECK-NEXT:    [[TMP613:%.*]] = getelementptr double, ptr [[B]], i64 38
-; CHECK-NEXT:    [[COL_LOAD847:%.*]] = load <2 x double>, ptr [[TMP613]], align 8
-; CHECK-NEXT:    [[VEC_GEP848:%.*]] = getelementptr double, ptr [[TMP613]], i64 8
-; CHECK-NEXT:    [[COL_LOAD849:%.*]] = load <2 x double>, ptr [[VEC_GEP848]], align 8
-; CHECK-NEXT:    [[BLOCK850:%.*]] = shufflevector <2 x double> [[TMP605]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK851:%.*]] = shufflevector <2 x double> [[COL_LOAD844]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP614:%.*]] = extractelement <2 x double> [[COL_LOAD847]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT852:%.*]] = insertelement <2 x double> poison, double [[TMP614]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT853:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT852]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP615:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK851]], <2 x double> [[SPLAT_SPLAT853]], <2 x double> [[BLOCK850]])
-; CHECK-NEXT:    [[BLOCK854:%.*]] = shufflevector <2 x double> [[COL_LOAD846]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP616:%.*]] = extractelement <2 x double> [[COL_LOAD847]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT855:%.*]] = insertelement <2 x double> poison, double [[TMP616]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT856:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT855]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP617:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK854]], <2 x double> [[SPLAT_SPLAT856]], <2 x double> [[TMP615]])
-; CHECK-NEXT:    [[TMP618:%.*]] = shufflevector <2 x double> [[TMP617]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP619:%.*]] = shufflevector <2 x double> [[TMP605]], <2 x double> [[TMP618]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK857:%.*]] = shufflevector <2 x double> [[TMP611]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK858:%.*]] = shufflevector <2 x double> [[COL_LOAD844]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP620:%.*]] = extractelement <2 x double> [[COL_LOAD849]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT859:%.*]] = insertelement <2 x double> poison, double [[TMP620]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT860:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT859]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP621:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK858]], <2 x double> [[SPLAT_SPLAT860]], <2 x double> [[BLOCK857]])
-; CHECK-NEXT:    [[BLOCK861:%.*]] = shufflevector <2 x double> [[COL_LOAD846]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP622:%.*]] = extractelement <2 x double> [[COL_LOAD849]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT862:%.*]] = insertelement <2 x double> poison, double [[TMP622]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT863:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT862]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP623:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK861]], <2 x double> [[SPLAT_SPLAT863]], <2 x double> [[TMP621]])
-; CHECK-NEXT:    [[TMP624:%.*]] = shufflevector <2 x double> [[TMP623]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP625:%.*]] = shufflevector <2 x double> [[TMP611]], <2 x double> [[TMP624]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP626:%.*]] = getelementptr double, ptr [[C]], i64 36
-; CHECK-NEXT:    store <2 x double> [[TMP619]], ptr [[TMP626]], align 8
-; CHECK-NEXT:    [[VEC_GEP864:%.*]] = getelementptr double, ptr [[TMP626]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP625]], ptr [[VEC_GEP864]], align 8
-; CHECK-NEXT:    [[TMP627:%.*]] = getelementptr double, ptr [[A]], i64 6
-; CHECK-NEXT:    [[COL_LOAD865:%.*]] = load <2 x double>, ptr [[TMP627]], align 8
-; CHECK-NEXT:    [[VEC_GEP866:%.*]] = getelementptr double, ptr [[TMP627]], i64 8
-; CHECK-NEXT:    [[COL_LOAD867:%.*]] = load <2 x double>, ptr [[VEC_GEP866]], align 8
-; CHECK-NEXT:    [[TMP628:%.*]] = getelementptr double, ptr [[B]], i64 32
-; CHECK-NEXT:    [[COL_LOAD868:%.*]] = load <2 x double>, ptr [[TMP628]], align 8
-; CHECK-NEXT:    [[VEC_GEP869:%.*]] = getelementptr double, ptr [[TMP628]], i64 8
-; CHECK-NEXT:    [[COL_LOAD870:%.*]] = load <2 x double>, ptr [[VEC_GEP869]], align 8
-; CHECK-NEXT:    [[BLOCK871:%.*]] = shufflevector <2 x double> [[COL_LOAD865]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP629:%.*]] = extractelement <2 x double> [[COL_LOAD868]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT872:%.*]] = insertelement <2 x double> poison, double [[TMP629]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT873:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT872]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP630:%.*]] = fmul contract <2 x double> [[BLOCK871]], [[SPLAT_SPLAT873]]
-; CHECK-NEXT:    [[BLOCK874:%.*]] = shufflevector <2 x double> [[COL_LOAD867]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP631:%.*]] = extractelement <2 x double> [[COL_LOAD868]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT875:%.*]] = insertelement <2 x double> poison, double [[TMP631]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT876:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT875]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP632:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK874]], <2 x double> [[SPLAT_SPLAT876]], <2 x double> [[TMP630]])
-; CHECK-NEXT:    [[TMP633:%.*]] = shufflevector <2 x double> [[TMP632]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP634:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP633]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK877:%.*]] = shufflevector <2 x double> [[COL_LOAD865]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP635:%.*]] = extractelement <2 x double> [[COL_LOAD870]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT878:%.*]] = insertelement <2 x double> poison, double [[TMP635]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT879:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT878]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP636:%.*]] = fmul contract <2 x double> [[BLOCK877]], [[SPLAT_SPLAT879]]
-; CHECK-NEXT:    [[BLOCK880:%.*]] = shufflevector <2 x double> [[COL_LOAD867]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP637:%.*]] = extractelement <2 x double> [[COL_LOAD870]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT881:%.*]] = insertelement <2 x double> poison, double [[TMP637]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT882:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT881]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP638:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK880]], <2 x double> [[SPLAT_SPLAT882]], <2 x double> [[TMP636]])
-; CHECK-NEXT:    [[TMP639:%.*]] = shufflevector <2 x double> [[TMP638]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP640:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP639]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP641:%.*]] = getelementptr double, ptr [[A]], i64 22
-; CHECK-NEXT:    [[COL_LOAD883:%.*]] = load <2 x double>, ptr [[TMP641]], align 8
-; CHECK-NEXT:    [[VEC_GEP884:%.*]] = getelementptr double, ptr [[TMP641]], i64 8
-; CHECK-NEXT:    [[COL_LOAD885:%.*]] = load <2 x double>, ptr [[VEC_GEP884]], align 8
-; CHECK-NEXT:    [[TMP642:%.*]] = getelementptr double, ptr [[B]], i64 34
-; CHECK-NEXT:    [[COL_LOAD886:%.*]] = load <2 x double>, ptr [[TMP642]], align 8
-; CHECK-NEXT:    [[VEC_GEP887:%.*]] = getelementptr double, ptr [[TMP642]], i64 8
-; CHECK-NEXT:    [[COL_LOAD888:%.*]] = load <2 x double>, ptr [[VEC_GEP887]], align 8
-; CHECK-NEXT:    [[BLOCK889:%.*]] = shufflevector <2 x double> [[TMP634]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK890:%.*]] = shufflevector <2 x double> [[COL_LOAD883]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP643:%.*]] = extractelement <2 x double> [[COL_LOAD886]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT891:%.*]] = insertelement <2 x double> poison, double [[TMP643]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT892:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT891]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP644:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK890]], <2 x double> [[SPLAT_SPLAT892]], <2 x double> [[BLOCK889]])
-; CHECK-NEXT:    [[BLOCK893:%.*]] = shufflevector <2 x double> [[COL_LOAD885]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP645:%.*]] = extractelement <2 x double> [[COL_LOAD886]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT894:%.*]] = insertelement <2 x double> poison, double [[TMP645]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT895:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT894]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP646:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK893]], <2 x double> [[SPLAT_SPLAT895]], <2 x double> [[TMP644]])
-; CHECK-NEXT:    [[TMP647:%.*]] = shufflevector <2 x double> [[TMP646]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP648:%.*]] = shufflevector <2 x double> [[TMP634]], <2 x double> [[TMP647]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK896:%.*]] = shufflevector <2 x double> [[TMP640]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK897:%.*]] = shufflevector <2 x double> [[COL_LOAD883]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP649:%.*]] = extractelement <2 x double> [[COL_LOAD888]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT898:%.*]] = insertelement <2 x double> poison, double [[TMP649]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT899:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT898]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP650:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK897]], <2 x double> [[SPLAT_SPLAT899]], <2 x double> [[BLOCK896]])
-; CHECK-NEXT:    [[BLOCK900:%.*]] = shufflevector <2 x double> [[COL_LOAD885]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP651:%.*]] = extractelement <2 x double> [[COL_LOAD888]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT901:%.*]] = insertelement <2 x double> poison, double [[TMP651]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT902:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT901]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP652:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK900]], <2 x double> [[SPLAT_SPLAT902]], <2 x double> [[TMP650]])
-; CHECK-NEXT:    [[TMP653:%.*]] = shufflevector <2 x double> [[TMP652]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP654:%.*]] = shufflevector <2 x double> [[TMP640]], <2 x double> [[TMP653]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP655:%.*]] = getelementptr double, ptr [[A]], i64 38
-; CHECK-NEXT:    [[COL_LOAD903:%.*]] = load <2 x double>, ptr [[TMP655]], align 8
-; CHECK-NEXT:    [[VEC_GEP904:%.*]] = getelementptr double, ptr [[TMP655]], i64 8
-; CHECK-NEXT:    [[COL_LOAD905:%.*]] = load <2 x double>, ptr [[VEC_GEP904]], align 8
-; CHECK-NEXT:    [[TMP656:%.*]] = getelementptr double, ptr [[B]], i64 36
-; CHECK-NEXT:    [[COL_LOAD906:%.*]] = load <2 x double>, ptr [[TMP656]], align 8
-; CHECK-NEXT:    [[VEC_GEP907:%.*]] = getelementptr double, ptr [[TMP656]], i64 8
-; CHECK-NEXT:    [[COL_LOAD908:%.*]] = load <2 x double>, ptr [[VEC_GEP907]], align 8
-; CHECK-NEXT:    [[BLOCK909:%.*]] = shufflevector <2 x double> [[TMP648]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK910:%.*]] = shufflevector <2 x double> [[COL_LOAD903]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP657:%.*]] = extractelement <2 x double> [[COL_LOAD906]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT911:%.*]] = insertelement <2 x double> poison, double [[TMP657]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT912:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT911]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP658:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK910]], <2 x double> [[SPLAT_SPLAT912]], <2 x double> [[BLOCK909]])
-; CHECK-NEXT:    [[BLOCK913:%.*]] = shufflevector <2 x double> [[COL_LOAD905]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP659:%.*]] = extractelement <2 x double> [[COL_LOAD906]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT914:%.*]] = insertelement <2 x double> poison, double [[TMP659]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT915:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT914]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP660:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK913]], <2 x double> [[SPLAT_SPLAT915]], <2 x double> [[TMP658]])
-; CHECK-NEXT:    [[TMP661:%.*]] = shufflevector <2 x double> [[TMP660]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP662:%.*]] = shufflevector <2 x double> [[TMP648]], <2 x double> [[TMP661]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK916:%.*]] = shufflevector <2 x double> [[TMP654]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK917:%.*]] = shufflevector <2 x double> [[COL_LOAD903]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP663:%.*]] = extractelement <2 x double> [[COL_LOAD908]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT918:%.*]] = insertelement <2 x double> poison, double [[TMP663]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT919:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT918]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP664:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK917]], <2 x double> [[SPLAT_SPLAT919]], <2 x double> [[BLOCK916]])
-; CHECK-NEXT:    [[BLOCK920:%.*]] = shufflevector <2 x double> [[COL_LOAD905]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP665:%.*]] = extractelement <2 x double> [[COL_LOAD908]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT921:%.*]] = insertelement <2 x double> poison, double [[TMP665]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT922:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT921]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP666:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK920]], <2 x double> [[SPLAT_SPLAT922]], <2 x double> [[TMP664]])
-; CHECK-NEXT:    [[TMP667:%.*]] = shufflevector <2 x double> [[TMP666]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP668:%.*]] = shufflevector <2 x double> [[TMP654]], <2 x double> [[TMP667]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP669:%.*]] = getelementptr double, ptr [[A]], i64 54
-; CHECK-NEXT:    [[COL_LOAD923:%.*]] = load <2 x double>, ptr [[TMP669]], align 8
-; CHECK-NEXT:    [[VEC_GEP924:%.*]] = getelementptr double, ptr [[TMP669]], i64 8
-; CHECK-NEXT:    [[COL_LOAD925:%.*]] = load <2 x double>, ptr [[VEC_GEP924]], align 8
-; CHECK-NEXT:    [[TMP670:%.*]] = getelementptr double, ptr [[B]], i64 38
-; CHECK-NEXT:    [[COL_LOAD926:%.*]] = load <2 x double>, ptr [[TMP670]], align 8
-; CHECK-NEXT:    [[VEC_GEP927:%.*]] = getelementptr double, ptr [[TMP670]], i64 8
-; CHECK-NEXT:    [[COL_LOAD928:%.*]] = load <2 x double>, ptr [[VEC_GEP927]], align 8
-; CHECK-NEXT:    [[BLOCK929:%.*]] = shufflevector <2 x double> [[TMP662]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK930:%.*]] = shufflevector <2 x double> [[COL_LOAD923]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP671:%.*]] = extractelement <2 x double> [[COL_LOAD926]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT931:%.*]] = insertelement <2 x double> poison, double [[TMP671]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT932:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT931]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP672:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK930]], <2 x double> [[SPLAT_SPLAT932]], <2 x double> [[BLOCK929]])
-; CHECK-NEXT:    [[BLOCK933:%.*]] = shufflevector <2 x double> [[COL_LOAD925]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP673:%.*]] = extractelement <2 x double> [[COL_LOAD926]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT934:%.*]] = insertelement <2 x double> poison, double [[TMP673]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT935:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT934]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP674:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK933]], <2 x double> [[SPLAT_SPLAT935]], <2 x double> [[TMP672]])
-; CHECK-NEXT:    [[TMP675:%.*]] = shufflevector <2 x double> [[TMP674]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP676:%.*]] = shufflevector <2 x double> [[TMP662]], <2 x double> [[TMP675]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK936:%.*]] = shufflevector <2 x double> [[TMP668]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK937:%.*]] = shufflevector <2 x double> [[COL_LOAD923]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP677:%.*]] = extractelement <2 x double> [[COL_LOAD928]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT938:%.*]] = insertelement <2 x double> poison, double [[TMP677]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT939:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT938]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP678:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK937]], <2 x double> [[SPLAT_SPLAT939]], <2 x double> [[BLOCK936]])
-; CHECK-NEXT:    [[BLOCK940:%.*]] = shufflevector <2 x double> [[COL_LOAD925]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP679:%.*]] = extractelement <2 x double> [[COL_LOAD928]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT941:%.*]] = insertelement <2 x double> poison, double [[TMP679]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT942:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT941]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP680:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK940]], <2 x double> [[SPLAT_SPLAT942]], <2 x double> [[TMP678]])
-; CHECK-NEXT:    [[TMP681:%.*]] = shufflevector <2 x double> [[TMP680]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP682:%.*]] = shufflevector <2 x double> [[TMP668]], <2 x double> [[TMP681]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP683:%.*]] = getelementptr double, ptr [[C]], i64 38
-; CHECK-NEXT:    store <2 x double> [[TMP676]], ptr [[TMP683]], align 8
-; CHECK-NEXT:    [[VEC_GEP943:%.*]] = getelementptr double, ptr [[TMP683]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP682]], ptr [[VEC_GEP943]], align 8
-; CHECK-NEXT:    [[TMP684:%.*]] = getelementptr double, ptr [[A]], i64 0
-; CHECK-NEXT:    [[COL_LOAD944:%.*]] = load <2 x double>, ptr [[TMP684]], align 8
-; CHECK-NEXT:    [[VEC_GEP945:%.*]] = getelementptr double, ptr [[TMP684]], i64 8
-; CHECK-NEXT:    [[COL_LOAD946:%.*]] = load <2 x double>, ptr [[VEC_GEP945]], align 8
-; CHECK-NEXT:    [[TMP685:%.*]] = getelementptr double, ptr [[B]], i64 48
-; CHECK-NEXT:    [[COL_LOAD947:%.*]] = load <2 x double>, ptr [[TMP685]], align 8
-; CHECK-NEXT:    [[VEC_GEP948:%.*]] = getelementptr double, ptr [[TMP685]], i64 8
-; CHECK-NEXT:    [[COL_LOAD949:%.*]] = load <2 x double>, ptr [[VEC_GEP948]], align 8
-; CHECK-NEXT:    [[BLOCK950:%.*]] = shufflevector <2 x double> [[COL_LOAD944]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP686:%.*]] = extractelement <2 x double> [[COL_LOAD947]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT951:%.*]] = insertelement <2 x double> poison, double [[TMP686]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT952:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT951]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP687:%.*]] = fmul contract <2 x double> [[BLOCK950]], [[SPLAT_SPLAT952]]
-; CHECK-NEXT:    [[BLOCK953:%.*]] = shufflevector <2 x double> [[COL_LOAD946]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP688:%.*]] = extractelement <2 x double> [[COL_LOAD947]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT954:%.*]] = insertelement <2 x double> poison, double [[TMP688]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT955:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT954]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP689:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK953]], <2 x double> [[SPLAT_SPLAT955]], <2 x double> [[TMP687]])
-; CHECK-NEXT:    [[TMP690:%.*]] = shufflevector <2 x double> [[TMP689]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP691:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP690]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK956:%.*]] = shufflevector <2 x double> [[COL_LOAD944]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP692:%.*]] = extractelement <2 x double> [[COL_LOAD949]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT957:%.*]] = insertelement <2 x double> poison, double [[TMP692]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT958:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT957]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP693:%.*]] = fmul contract <2 x double> [[BLOCK956]], [[SPLAT_SPLAT958]]
-; CHECK-NEXT:    [[BLOCK959:%.*]] = shufflevector <2 x double> [[COL_LOAD946]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP694:%.*]] = extractelement <2 x double> [[COL_LOAD949]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT960:%.*]] = insertelement <2 x double> poison, double [[TMP694]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT961:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT960]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP695:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK959]], <2 x double> [[SPLAT_SPLAT961]], <2 x double> [[TMP693]])
-; CHECK-NEXT:    [[TMP696:%.*]] = shufflevector <2 x double> [[TMP695]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP697:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP696]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP698:%.*]] = getelementptr double, ptr [[A]], i64 16
-; CHECK-NEXT:    [[COL_LOAD962:%.*]] = load <2 x double>, ptr [[TMP698]], align 8
-; CHECK-NEXT:    [[VEC_GEP963:%.*]] = getelementptr double, ptr [[TMP698]], i64 8
-; CHECK-NEXT:    [[COL_LOAD964:%.*]] = load <2 x double>, ptr [[VEC_GEP963]], align 8
-; CHECK-NEXT:    [[TMP699:%.*]] = getelementptr double, ptr [[B]], i64 50
-; CHECK-NEXT:    [[COL_LOAD965:%.*]] = load <2 x double>, ptr [[TMP699]], align 8
-; CHECK-NEXT:    [[VEC_GEP966:%.*]] = getelementptr double, ptr [[TMP699]], i64 8
-; CHECK-NEXT:    [[COL_LOAD967:%.*]] = load <2 x double>, ptr [[VEC_GEP966]], align 8
-; CHECK-NEXT:    [[BLOCK968:%.*]] = shufflevector <2 x double> [[TMP691]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK969:%.*]] = shufflevector <2 x double> [[COL_LOAD962]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP700:%.*]] = extractelement <2 x double> [[COL_LOAD965]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT970:%.*]] = insertelement <2 x double> poison, double [[TMP700]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT971:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT970]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP701:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK969]], <2 x double> [[SPLAT_SPLAT971]], <2 x double> [[BLOCK968]])
-; CHECK-NEXT:    [[BLOCK972:%.*]] = shufflevector <2 x double> [[COL_LOAD964]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP702:%.*]] = extractelement <2 x double> [[COL_LOAD965]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT973:%.*]] = insertelement <2 x double> poison, double [[TMP702]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT974:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT973]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP703:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK972]], <2 x double> [[SPLAT_SPLAT974]], <2 x double> [[TMP701]])
-; CHECK-NEXT:    [[TMP704:%.*]] = shufflevector <2 x double> [[TMP703]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP705:%.*]] = shufflevector <2 x double> [[TMP691]], <2 x double> [[TMP704]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK975:%.*]] = shufflevector <2 x double> [[TMP697]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK976:%.*]] = shufflevector <2 x double> [[COL_LOAD962]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP706:%.*]] = extractelement <2 x double> [[COL_LOAD967]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT977:%.*]] = insertelement <2 x double> poison, double [[TMP706]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT978:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT977]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP707:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK976]], <2 x double> [[SPLAT_SPLAT978]], <2 x double> [[BLOCK975]])
-; CHECK-NEXT:    [[BLOCK979:%.*]] = shufflevector <2 x double> [[COL_LOAD964]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP708:%.*]] = extractelement <2 x double> [[COL_LOAD967]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT980:%.*]] = insertelement <2 x double> poison, double [[TMP708]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT981:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT980]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP709:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK979]], <2 x double> [[SPLAT_SPLAT981]], <2 x double> [[TMP707]])
-; CHECK-NEXT:    [[TMP710:%.*]] = shufflevector <2 x double> [[TMP709]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP711:%.*]] = shufflevector <2 x double> [[TMP697]], <2 x double> [[TMP710]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP712:%.*]] = getelementptr double, ptr [[A]], i64 32
-; CHECK-NEXT:    [[COL_LOAD982:%.*]] = load <2 x double>, ptr [[TMP712]], align 8
-; CHECK-NEXT:    [[VEC_GEP983:%.*]] = getelementptr double, ptr [[TMP712]], i64 8
-; CHECK-NEXT:    [[COL_LOAD984:%.*]] = load <2 x double>, ptr [[VEC_GEP983]], align 8
-; CHECK-NEXT:    [[TMP713:%.*]] = getelementptr double, ptr [[B]], i64 52
-; CHECK-NEXT:    [[COL_LOAD985:%.*]] = load <2 x double>, ptr [[TMP713]], align 8
-; CHECK-NEXT:    [[VEC_GEP986:%.*]] = getelementptr double, ptr [[TMP713]], i64 8
-; CHECK-NEXT:    [[COL_LOAD987:%.*]] = load <2 x double>, ptr [[VEC_GEP986]], align 8
-; CHECK-NEXT:    [[BLOCK988:%.*]] = shufflevector <2 x double> [[TMP705]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK989:%.*]] = shufflevector <2 x double> [[COL_LOAD982]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP714:%.*]] = extractelement <2 x double> [[COL_LOAD985]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT990:%.*]] = insertelement <2 x double> poison, double [[TMP714]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT991:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT990]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP715:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK989]], <2 x double> [[SPLAT_SPLAT991]], <2 x double> [[BLOCK988]])
-; CHECK-NEXT:    [[BLOCK992:%.*]] = shufflevector <2 x double> [[COL_LOAD984]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP716:%.*]] = extractelement <2 x double> [[COL_LOAD985]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT993:%.*]] = insertelement <2 x double> poison, double [[TMP716]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT994:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT993]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP717:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK992]], <2 x double> [[SPLAT_SPLAT994]], <2 x double> [[TMP715]])
-; CHECK-NEXT:    [[TMP718:%.*]] = shufflevector <2 x double> [[TMP717]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP719:%.*]] = shufflevector <2 x double> [[TMP705]], <2 x double> [[TMP718]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK995:%.*]] = shufflevector <2 x double> [[TMP711]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK996:%.*]] = shufflevector <2 x double> [[COL_LOAD982]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP720:%.*]] = extractelement <2 x double> [[COL_LOAD987]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT997:%.*]] = insertelement <2 x double> poison, double [[TMP720]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT998:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT997]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP721:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK996]], <2 x double> [[SPLAT_SPLAT998]], <2 x double> [[BLOCK995]])
-; CHECK-NEXT:    [[BLOCK999:%.*]] = shufflevector <2 x double> [[COL_LOAD984]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP722:%.*]] = extractelement <2 x double> [[COL_LOAD987]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1000:%.*]] = insertelement <2 x double> poison, double [[TMP722]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1001:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1000]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP723:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK999]], <2 x double> [[SPLAT_SPLAT1001]], <2 x double> [[TMP721]])
-; CHECK-NEXT:    [[TMP724:%.*]] = shufflevector <2 x double> [[TMP723]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP725:%.*]] = shufflevector <2 x double> [[TMP711]], <2 x double> [[TMP724]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP726:%.*]] = getelementptr double, ptr [[A]], i64 48
-; CHECK-NEXT:    [[COL_LOAD1002:%.*]] = load <2 x double>, ptr [[TMP726]], align 8
-; CHECK-NEXT:    [[VEC_GEP1003:%.*]] = getelementptr double, ptr [[TMP726]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1004:%.*]] = load <2 x double>, ptr [[VEC_GEP1003]], align 8
-; CHECK-NEXT:    [[TMP727:%.*]] = getelementptr double, ptr [[B]], i64 54
-; CHECK-NEXT:    [[COL_LOAD1005:%.*]] = load <2 x double>, ptr [[TMP727]], align 8
-; CHECK-NEXT:    [[VEC_GEP1006:%.*]] = getelementptr double, ptr [[TMP727]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1007:%.*]] = load <2 x double>, ptr [[VEC_GEP1006]], align 8
-; CHECK-NEXT:    [[BLOCK1008:%.*]] = shufflevector <2 x double> [[TMP719]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1009:%.*]] = shufflevector <2 x double> [[COL_LOAD1002]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP728:%.*]] = extractelement <2 x double> [[COL_LOAD1005]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1010:%.*]] = insertelement <2 x double> poison, double [[TMP728]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1011:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1010]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP729:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1009]], <2 x double> [[SPLAT_SPLAT1011]], <2 x double> [[BLOCK1008]])
-; CHECK-NEXT:    [[BLOCK1012:%.*]] = shufflevector <2 x double> [[COL_LOAD1004]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP730:%.*]] = extractelement <2 x double> [[COL_LOAD1005]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1013:%.*]] = insertelement <2 x double> poison, double [[TMP730]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1014:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1013]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP731:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1012]], <2 x double> [[SPLAT_SPLAT1014]], <2 x double> [[TMP729]])
-; CHECK-NEXT:    [[TMP732:%.*]] = shufflevector <2 x double> [[TMP731]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP733:%.*]] = shufflevector <2 x double> [[TMP719]], <2 x double> [[TMP732]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1015:%.*]] = shufflevector <2 x double> [[TMP725]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1016:%.*]] = shufflevector <2 x double> [[COL_LOAD1002]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP734:%.*]] = extractelement <2 x double> [[COL_LOAD1007]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1017:%.*]] = insertelement <2 x double> poison, double [[TMP734]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1018:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1017]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP735:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1016]], <2 x double> [[SPLAT_SPLAT1018]], <2 x double> [[BLOCK1015]])
-; CHECK-NEXT:    [[BLOCK1019:%.*]] = shufflevector <2 x double> [[COL_LOAD1004]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP736:%.*]] = extractelement <2 x double> [[COL_LOAD1007]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1020:%.*]] = insertelement <2 x double> poison, double [[TMP736]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1021:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1020]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP737:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1019]], <2 x double> [[SPLAT_SPLAT1021]], <2 x double> [[TMP735]])
-; CHECK-NEXT:    [[TMP738:%.*]] = shufflevector <2 x double> [[TMP737]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP739:%.*]] = shufflevector <2 x double> [[TMP725]], <2 x double> [[TMP738]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP740:%.*]] = getelementptr double, ptr [[C]], i64 48
-; CHECK-NEXT:    store <2 x double> [[TMP733]], ptr [[TMP740]], align 8
-; CHECK-NEXT:    [[VEC_GEP1022:%.*]] = getelementptr double, ptr [[TMP740]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP739]], ptr [[VEC_GEP1022]], align 8
-; CHECK-NEXT:    [[TMP741:%.*]] = getelementptr double, ptr [[A]], i64 2
-; CHECK-NEXT:    [[COL_LOAD1023:%.*]] = load <2 x double>, ptr [[TMP741]], align 8
-; CHECK-NEXT:    [[VEC_GEP1024:%.*]] = getelementptr double, ptr [[TMP741]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1025:%.*]] = load <2 x double>, ptr [[VEC_GEP1024]], align 8
-; CHECK-NEXT:    [[TMP742:%.*]] = getelementptr double, ptr [[B]], i64 48
-; CHECK-NEXT:    [[COL_LOAD1026:%.*]] = load <2 x double>, ptr [[TMP742]], align 8
-; CHECK-NEXT:    [[VEC_GEP1027:%.*]] = getelementptr double, ptr [[TMP742]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1028:%.*]] = load <2 x double>, ptr [[VEC_GEP1027]], align 8
-; CHECK-NEXT:    [[BLOCK1029:%.*]] = shufflevector <2 x double> [[COL_LOAD1023]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP743:%.*]] = extractelement <2 x double> [[COL_LOAD1026]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1030:%.*]] = insertelement <2 x double> poison, double [[TMP743]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1031:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1030]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP744:%.*]] = fmul contract <2 x double> [[BLOCK1029]], [[SPLAT_SPLAT1031]]
-; CHECK-NEXT:    [[BLOCK1032:%.*]] = shufflevector <2 x double> [[COL_LOAD1025]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP745:%.*]] = extractelement <2 x double> [[COL_LOAD1026]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1033:%.*]] = insertelement <2 x double> poison, double [[TMP745]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1034:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1033]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP746:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1032]], <2 x double> [[SPLAT_SPLAT1034]], <2 x double> [[TMP744]])
-; CHECK-NEXT:    [[TMP747:%.*]] = shufflevector <2 x double> [[TMP746]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP748:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP747]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1035:%.*]] = shufflevector <2 x double> [[COL_LOAD1023]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP749:%.*]] = extractelement <2 x double> [[COL_LOAD1028]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1036:%.*]] = insertelement <2 x double> poison, double [[TMP749]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1037:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1036]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP750:%.*]] = fmul contract <2 x double> [[BLOCK1035]], [[SPLAT_SPLAT1037]]
-; CHECK-NEXT:    [[BLOCK1038:%.*]] = shufflevector <2 x double> [[COL_LOAD1025]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP751:%.*]] = extractelement <2 x double> [[COL_LOAD1028]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1039:%.*]] = insertelement <2 x double> poison, double [[TMP751]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1040:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1039]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP752:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1038]], <2 x double> [[SPLAT_SPLAT1040]], <2 x double> [[TMP750]])
-; CHECK-NEXT:    [[TMP753:%.*]] = shufflevector <2 x double> [[TMP752]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP754:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP753]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP755:%.*]] = getelementptr double, ptr [[A]], i64 18
-; CHECK-NEXT:    [[COL_LOAD1041:%.*]] = load <2 x double>, ptr [[TMP755]], align 8
-; CHECK-NEXT:    [[VEC_GEP1042:%.*]] = getelementptr double, ptr [[TMP755]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1043:%.*]] = load <2 x double>, ptr [[VEC_GEP1042]], align 8
-; CHECK-NEXT:    [[TMP756:%.*]] = getelementptr double, ptr [[B]], i64 50
-; CHECK-NEXT:    [[COL_LOAD1044:%.*]] = load <2 x double>, ptr [[TMP756]], align 8
-; CHECK-NEXT:    [[VEC_GEP1045:%.*]] = getelementptr double, ptr [[TMP756]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1046:%.*]] = load <2 x double>, ptr [[VEC_GEP1045]], align 8
-; CHECK-NEXT:    [[BLOCK1047:%.*]] = shufflevector <2 x double> [[TMP748]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1048:%.*]] = shufflevector <2 x double> [[COL_LOAD1041]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP757:%.*]] = extractelement <2 x double> [[COL_LOAD1044]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1049:%.*]] = insertelement <2 x double> poison, double [[TMP757]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1050:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1049]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP758:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1048]], <2 x double> [[SPLAT_SPLAT1050]], <2 x double> [[BLOCK1047]])
-; CHECK-NEXT:    [[BLOCK1051:%.*]] = shufflevector <2 x double> [[COL_LOAD1043]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP759:%.*]] = extractelement <2 x double> [[COL_LOAD1044]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1052:%.*]] = insertelement <2 x double> poison, double [[TMP759]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1053:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1052]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP760:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1051]], <2 x double> [[SPLAT_SPLAT1053]], <2 x double> [[TMP758]])
-; CHECK-NEXT:    [[TMP761:%.*]] = shufflevector <2 x double> [[TMP760]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP762:%.*]] = shufflevector <2 x double> [[TMP748]], <2 x double> [[TMP761]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1054:%.*]] = shufflevector <2 x double> [[TMP754]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1055:%.*]] = shufflevector <2 x double> [[COL_LOAD1041]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP763:%.*]] = extractelement <2 x double> [[COL_LOAD1046]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1056:%.*]] = insertelement <2 x double> poison, double [[TMP763]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1057:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1056]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP764:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1055]], <2 x double> [[SPLAT_SPLAT1057]], <2 x double> [[BLOCK1054]])
-; CHECK-NEXT:    [[BLOCK1058:%.*]] = shufflevector <2 x double> [[COL_LOAD1043]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP765:%.*]] = extractelement <2 x double> [[COL_LOAD1046]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1059:%.*]] = insertelement <2 x double> poison, double [[TMP765]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1060:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1059]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP766:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1058]], <2 x double> [[SPLAT_SPLAT1060]], <2 x double> [[TMP764]])
-; CHECK-NEXT:    [[TMP767:%.*]] = shufflevector <2 x double> [[TMP766]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP768:%.*]] = shufflevector <2 x double> [[TMP754]], <2 x double> [[TMP767]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP769:%.*]] = getelementptr double, ptr [[A]], i64 34
-; CHECK-NEXT:    [[COL_LOAD1061:%.*]] = load <2 x double>, ptr [[TMP769]], align 8
-; CHECK-NEXT:    [[VEC_GEP1062:%.*]] = getelementptr double, ptr [[TMP769]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1063:%.*]] = load <2 x double>, ptr [[VEC_GEP1062]], align 8
-; CHECK-NEXT:    [[TMP770:%.*]] = getelementptr double, ptr [[B]], i64 52
-; CHECK-NEXT:    [[COL_LOAD1064:%.*]] = load <2 x double>, ptr [[TMP770]], align 8
-; CHECK-NEXT:    [[VEC_GEP1065:%.*]] = getelementptr double, ptr [[TMP770]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1066:%.*]] = load <2 x double>, ptr [[VEC_GEP1065]], align 8
-; CHECK-NEXT:    [[BLOCK1067:%.*]] = shufflevector <2 x double> [[TMP762]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1068:%.*]] = shufflevector <2 x double> [[COL_LOAD1061]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP771:%.*]] = extractelement <2 x double> [[COL_LOAD1064]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1069:%.*]] = insertelement <2 x double> poison, double [[TMP771]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1070:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1069]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP772:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1068]], <2 x double> [[SPLAT_SPLAT1070]], <2 x double> [[BLOCK1067]])
-; CHECK-NEXT:    [[BLOCK1071:%.*]] = shufflevector <2 x double> [[COL_LOAD1063]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP773:%.*]] = extractelement <2 x double> [[COL_LOAD1064]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1072:%.*]] = insertelement <2 x double> poison, double [[TMP773]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1073:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1072]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP774:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1071]], <2 x double> [[SPLAT_SPLAT1073]], <2 x double> [[TMP772]])
-; CHECK-NEXT:    [[TMP775:%.*]] = shufflevector <2 x double> [[TMP774]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP776:%.*]] = shufflevector <2 x double> [[TMP762]], <2 x double> [[TMP775]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1074:%.*]] = shufflevector <2 x double> [[TMP768]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1075:%.*]] = shufflevector <2 x double> [[COL_LOAD1061]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP777:%.*]] = extractelement <2 x double> [[COL_LOAD1066]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1076:%.*]] = insertelement <2 x double> poison, double [[TMP777]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1077:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1076]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP778:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1075]], <2 x double> [[SPLAT_SPLAT1077]], <2 x double> [[BLOCK1074]])
-; CHECK-NEXT:    [[BLOCK1078:%.*]] = shufflevector <2 x double> [[COL_LOAD1063]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP779:%.*]] = extractelement <2 x double> [[COL_LOAD1066]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1079:%.*]] = insertelement <2 x double> poison, double [[TMP779]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1080:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1079]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP780:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1078]], <2 x double> [[SPLAT_SPLAT1080]], <2 x double> [[TMP778]])
-; CHECK-NEXT:    [[TMP781:%.*]] = shufflevector <2 x double> [[TMP780]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP782:%.*]] = shufflevector <2 x double> [[TMP768]], <2 x double> [[TMP781]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP783:%.*]] = getelementptr double, ptr [[A]], i64 50
-; CHECK-NEXT:    [[COL_LOAD1081:%.*]] = load <2 x double>, ptr [[TMP783]], align 8
-; CHECK-NEXT:    [[VEC_GEP1082:%.*]] = getelementptr double, ptr [[TMP783]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1083:%.*]] = load <2 x double>, ptr [[VEC_GEP1082]], align 8
-; CHECK-NEXT:    [[TMP784:%.*]] = getelementptr double, ptr [[B]], i64 54
-; CHECK-NEXT:    [[COL_LOAD1084:%.*]] = load <2 x double>, ptr [[TMP784]], align 8
-; CHECK-NEXT:    [[VEC_GEP1085:%.*]] = getelementptr double, ptr [[TMP784]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1086:%.*]] = load <2 x double>, ptr [[VEC_GEP1085]], align 8
-; CHECK-NEXT:    [[BLOCK1087:%.*]] = shufflevector <2 x double> [[TMP776]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1088:%.*]] = shufflevector <2 x double> [[COL_LOAD1081]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP785:%.*]] = extractelement <2 x double> [[COL_LOAD1084]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1089:%.*]] = insertelement <2 x double> poison, double [[TMP785]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1090:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1089]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP786:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1088]], <2 x double> [[SPLAT_SPLAT1090]], <2 x double> [[BLOCK1087]])
-; CHECK-NEXT:    [[BLOCK1091:%.*]] = shufflevector <2 x double> [[COL_LOAD1083]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP787:%.*]] = extractelement <2 x double> [[COL_LOAD1084]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1092:%.*]] = insertelement <2 x double> poison, double [[TMP787]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1093:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1092]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP788:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1091]], <2 x double> [[SPLAT_SPLAT1093]], <2 x double> [[TMP786]])
-; CHECK-NEXT:    [[TMP789:%.*]] = shufflevector <2 x double> [[TMP788]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP790:%.*]] = shufflevector <2 x double> [[TMP776]], <2 x double> [[TMP789]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1094:%.*]] = shufflevector <2 x double> [[TMP782]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1095:%.*]] = shufflevector <2 x double> [[COL_LOAD1081]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP791:%.*]] = extractelement <2 x double> [[COL_LOAD1086]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1096:%.*]] = insertelement <2 x double> poison, double [[TMP791]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1097:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1096]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP792:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1095]], <2 x double> [[SPLAT_SPLAT1097]], <2 x double> [[BLOCK1094]])
-; CHECK-NEXT:    [[BLOCK1098:%.*]] = shufflevector <2 x double> [[COL_LOAD1083]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP793:%.*]] = extractelement <2 x double> [[COL_LOAD1086]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1099:%.*]] = insertelement <2 x double> poison, double [[TMP793]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1100:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1099]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP794:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1098]], <2 x double> [[SPLAT_SPLAT1100]], <2 x double> [[TMP792]])
-; CHECK-NEXT:    [[TMP795:%.*]] = shufflevector <2 x double> [[TMP794]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP796:%.*]] = shufflevector <2 x double> [[TMP782]], <2 x double> [[TMP795]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP797:%.*]] = getelementptr double, ptr [[C]], i64 50
-; CHECK-NEXT:    store <2 x double> [[TMP790]], ptr [[TMP797]], align 8
-; CHECK-NEXT:    [[VEC_GEP1101:%.*]] = getelementptr double, ptr [[TMP797]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP796]], ptr [[VEC_GEP1101]], align 8
-; CHECK-NEXT:    [[TMP798:%.*]] = getelementptr double, ptr [[A]], i64 4
-; CHECK-NEXT:    [[COL_LOAD1102:%.*]] = load <2 x double>, ptr [[TMP798]], align 8
-; CHECK-NEXT:    [[VEC_GEP1103:%.*]] = getelementptr double, ptr [[TMP798]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1104:%.*]] = load <2 x double>, ptr [[VEC_GEP1103]], align 8
-; CHECK-NEXT:    [[TMP799:%.*]] = getelementptr double, ptr [[B]], i64 48
-; CHECK-NEXT:    [[COL_LOAD1105:%.*]] = load <2 x double>, ptr [[TMP799]], align 8
-; CHECK-NEXT:    [[VEC_GEP1106:%.*]] = getelementptr double, ptr [[TMP799]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1107:%.*]] = load <2 x double>, ptr [[VEC_GEP1106]], align 8
-; CHECK-NEXT:    [[BLOCK1108:%.*]] = shufflevector <2 x double> [[COL_LOAD1102]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP800:%.*]] = extractelement <2 x double> [[COL_LOAD1105]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1109:%.*]] = insertelement <2 x double> poison, double [[TMP800]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1110:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1109]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP801:%.*]] = fmul contract <2 x double> [[BLOCK1108]], [[SPLAT_SPLAT1110]]
-; CHECK-NEXT:    [[BLOCK1111:%.*]] = shufflevector <2 x double> [[COL_LOAD1104]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP802:%.*]] = extractelement <2 x double> [[COL_LOAD1105]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1112:%.*]] = insertelement <2 x double> poison, double [[TMP802]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1113:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1112]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP803:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1111]], <2 x double> [[SPLAT_SPLAT1113]], <2 x double> [[TMP801]])
-; CHECK-NEXT:    [[TMP804:%.*]] = shufflevector <2 x double> [[TMP803]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP805:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP804]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1114:%.*]] = shufflevector <2 x double> [[COL_LOAD1102]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP806:%.*]] = extractelement <2 x double> [[COL_LOAD1107]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1115:%.*]] = insertelement <2 x double> poison, double [[TMP806]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1116:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1115]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP807:%.*]] = fmul contract <2 x double> [[BLOCK1114]], [[SPLAT_SPLAT1116]]
-; CHECK-NEXT:    [[BLOCK1117:%.*]] = shufflevector <2 x double> [[COL_LOAD1104]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP808:%.*]] = extractelement <2 x double> [[COL_LOAD1107]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1118:%.*]] = insertelement <2 x double> poison, double [[TMP808]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1119:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1118]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP809:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1117]], <2 x double> [[SPLAT_SPLAT1119]], <2 x double> [[TMP807]])
-; CHECK-NEXT:    [[TMP810:%.*]] = shufflevector <2 x double> [[TMP809]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP811:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP810]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP812:%.*]] = getelementptr double, ptr [[A]], i64 20
-; CHECK-NEXT:    [[COL_LOAD1120:%.*]] = load <2 x double>, ptr [[TMP812]], align 8
-; CHECK-NEXT:    [[VEC_GEP1121:%.*]] = getelementptr double, ptr [[TMP812]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1122:%.*]] = load <2 x double>, ptr [[VEC_GEP1121]], align 8
-; CHECK-NEXT:    [[TMP813:%.*]] = getelementptr double, ptr [[B]], i64 50
-; CHECK-NEXT:    [[COL_LOAD1123:%.*]] = load <2 x double>, ptr [[TMP813]], align 8
-; CHECK-NEXT:    [[VEC_GEP1124:%.*]] = getelementptr double, ptr [[TMP813]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1125:%.*]] = load <2 x double>, ptr [[VEC_GEP1124]], align 8
-; CHECK-NEXT:    [[BLOCK1126:%.*]] = shufflevector <2 x double> [[TMP805]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1127:%.*]] = shufflevector <2 x double> [[COL_LOAD1120]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP814:%.*]] = extractelement <2 x double> [[COL_LOAD1123]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1128:%.*]] = insertelement <2 x double> poison, double [[TMP814]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1129:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1128]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP815:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1127]], <2 x double> [[SPLAT_SPLAT1129]], <2 x double> [[BLOCK1126]])
-; CHECK-NEXT:    [[BLOCK1130:%.*]] = shufflevector <2 x double> [[COL_LOAD1122]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP816:%.*]] = extractelement <2 x double> [[COL_LOAD1123]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1131:%.*]] = insertelement <2 x double> poison, double [[TMP816]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1132:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1131]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP817:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1130]], <2 x double> [[SPLAT_SPLAT1132]], <2 x double> [[TMP815]])
-; CHECK-NEXT:    [[TMP818:%.*]] = shufflevector <2 x double> [[TMP817]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP819:%.*]] = shufflevector <2 x double> [[TMP805]], <2 x double> [[TMP818]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1133:%.*]] = shufflevector <2 x double> [[TMP811]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1134:%.*]] = shufflevector <2 x double> [[COL_LOAD1120]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP820:%.*]] = extractelement <2 x double> [[COL_LOAD1125]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1135:%.*]] = insertelement <2 x double> poison, double [[TMP820]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1136:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1135]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP821:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1134]], <2 x double> [[SPLAT_SPLAT1136]], <2 x double> [[BLOCK1133]])
-; CHECK-NEXT:    [[BLOCK1137:%.*]] = shufflevector <2 x double> [[COL_LOAD1122]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP822:%.*]] = extractelement <2 x double> [[COL_LOAD1125]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1138:%.*]] = insertelement <2 x double> poison, double [[TMP822]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1139:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1138]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP823:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1137]], <2 x double> [[SPLAT_SPLAT1139]], <2 x double> [[TMP821]])
-; CHECK-NEXT:    [[TMP824:%.*]] = shufflevector <2 x double> [[TMP823]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP825:%.*]] = shufflevector <2 x double> [[TMP811]], <2 x double> [[TMP824]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP826:%.*]] = getelementptr double, ptr [[A]], i64 36
-; CHECK-NEXT:    [[COL_LOAD1140:%.*]] = load <2 x double>, ptr [[TMP826]], align 8
-; CHECK-NEXT:    [[VEC_GEP1141:%.*]] = getelementptr double, ptr [[TMP826]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1142:%.*]] = load <2 x double>, ptr [[VEC_GEP1141]], align 8
-; CHECK-NEXT:    [[TMP827:%.*]] = getelementptr double, ptr [[B]], i64 52
-; CHECK-NEXT:    [[COL_LOAD1143:%.*]] = load <2 x double>, ptr [[TMP827]], align 8
-; CHECK-NEXT:    [[VEC_GEP1144:%.*]] = getelementptr double, ptr [[TMP827]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1145:%.*]] = load <2 x double>, ptr [[VEC_GEP1144]], align 8
-; CHECK-NEXT:    [[BLOCK1146:%.*]] = shufflevector <2 x double> [[TMP819]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1147:%.*]] = shufflevector <2 x double> [[COL_LOAD1140]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP828:%.*]] = extractelement <2 x double> [[COL_LOAD1143]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1148:%.*]] = insertelement <2 x double> poison, double [[TMP828]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1149:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1148]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP829:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1147]], <2 x double> [[SPLAT_SPLAT1149]], <2 x double> [[BLOCK1146]])
-; CHECK-NEXT:    [[BLOCK1150:%.*]] = shufflevector <2 x double> [[COL_LOAD1142]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP830:%.*]] = extractelement <2 x double> [[COL_LOAD1143]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1151:%.*]] = insertelement <2 x double> poison, double [[TMP830]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1152:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1151]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP831:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1150]], <2 x double> [[SPLAT_SPLAT1152]], <2 x double> [[TMP829]])
-; CHECK-NEXT:    [[TMP832:%.*]] = shufflevector <2 x double> [[TMP831]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP833:%.*]] = shufflevector <2 x double> [[TMP819]], <2 x double> [[TMP832]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1153:%.*]] = shufflevector <2 x double> [[TMP825]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1154:%.*]] = shufflevector <2 x double> [[COL_LOAD1140]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP834:%.*]] = extractelement <2 x double> [[COL_LOAD1145]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1155:%.*]] = insertelement <2 x double> poison, double [[TMP834]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1156:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1155]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP835:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1154]], <2 x double> [[SPLAT_SPLAT1156]], <2 x double> [[BLOCK1153]])
-; CHECK-NEXT:    [[BLOCK1157:%.*]] = shufflevector <2 x double> [[COL_LOAD1142]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP836:%.*]] = extractelement <2 x double> [[COL_LOAD1145]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1158:%.*]] = insertelement <2 x double> poison, double [[TMP836]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1159:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1158]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP837:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1157]], <2 x double> [[SPLAT_SPLAT1159]], <2 x double> [[TMP835]])
-; CHECK-NEXT:    [[TMP838:%.*]] = shufflevector <2 x double> [[TMP837]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP839:%.*]] = shufflevector <2 x double> [[TMP825]], <2 x double> [[TMP838]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP840:%.*]] = getelementptr double, ptr [[A]], i64 52
-; CHECK-NEXT:    [[COL_LOAD1160:%.*]] = load <2 x double>, ptr [[TMP840]], align 8
-; CHECK-NEXT:    [[VEC_GEP1161:%.*]] = getelementptr double, ptr [[TMP840]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1162:%.*]] = load <2 x double>, ptr [[VEC_GEP1161]], align 8
-; CHECK-NEXT:    [[TMP841:%.*]] = getelementptr double, ptr [[B]], i64 54
-; CHECK-NEXT:    [[COL_LOAD1163:%.*]] = load <2 x double>, ptr [[TMP841]], align 8
-; CHECK-NEXT:    [[VEC_GEP1164:%.*]] = getelementptr double, ptr [[TMP841]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1165:%.*]] = load <2 x double>, ptr [[VEC_GEP1164]], align 8
-; CHECK-NEXT:    [[BLOCK1166:%.*]] = shufflevector <2 x double> [[TMP833]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1167:%.*]] = shufflevector <2 x double> [[COL_LOAD1160]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP842:%.*]] = extractelement <2 x double> [[COL_LOAD1163]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1168:%.*]] = insertelement <2 x double> poison, double [[TMP842]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1169:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1168]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP843:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1167]], <2 x double> [[SPLAT_SPLAT1169]], <2 x double> [[BLOCK1166]])
-; CHECK-NEXT:    [[BLOCK1170:%.*]] = shufflevector <2 x double> [[COL_LOAD1162]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP844:%.*]] = extractelement <2 x double> [[COL_LOAD1163]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1171:%.*]] = insertelement <2 x double> poison, double [[TMP844]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1172:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1171]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP845:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1170]], <2 x double> [[SPLAT_SPLAT1172]], <2 x double> [[TMP843]])
-; CHECK-NEXT:    [[TMP846:%.*]] = shufflevector <2 x double> [[TMP845]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP847:%.*]] = shufflevector <2 x double> [[TMP833]], <2 x double> [[TMP846]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1173:%.*]] = shufflevector <2 x double> [[TMP839]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1174:%.*]] = shufflevector <2 x double> [[COL_LOAD1160]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP848:%.*]] = extractelement <2 x double> [[COL_LOAD1165]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1175:%.*]] = insertelement <2 x double> poison, double [[TMP848]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1176:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1175]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP849:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1174]], <2 x double> [[SPLAT_SPLAT1176]], <2 x double> [[BLOCK1173]])
-; CHECK-NEXT:    [[BLOCK1177:%.*]] = shufflevector <2 x double> [[COL_LOAD1162]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP850:%.*]] = extractelement <2 x double> [[COL_LOAD1165]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1178:%.*]] = insertelement <2 x double> poison, double [[TMP850]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1179:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1178]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP851:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1177]], <2 x double> [[SPLAT_SPLAT1179]], <2 x double> [[TMP849]])
-; CHECK-NEXT:    [[TMP852:%.*]] = shufflevector <2 x double> [[TMP851]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP853:%.*]] = shufflevector <2 x double> [[TMP839]], <2 x double> [[TMP852]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP854:%.*]] = getelementptr double, ptr [[C]], i64 52
-; CHECK-NEXT:    store <2 x double> [[TMP847]], ptr [[TMP854]], align 8
-; CHECK-NEXT:    [[VEC_GEP1180:%.*]] = getelementptr double, ptr [[TMP854]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP853]], ptr [[VEC_GEP1180]], align 8
-; CHECK-NEXT:    [[TMP855:%.*]] = getelementptr double, ptr [[A]], i64 6
-; CHECK-NEXT:    [[COL_LOAD1181:%.*]] = load <2 x double>, ptr [[TMP855]], align 8
-; CHECK-NEXT:    [[VEC_GEP1182:%.*]] = getelementptr double, ptr [[TMP855]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1183:%.*]] = load <2 x double>, ptr [[VEC_GEP1182]], align 8
-; CHECK-NEXT:    [[TMP856:%.*]] = getelementptr double, ptr [[B]], i64 48
-; CHECK-NEXT:    [[COL_LOAD1184:%.*]] = load <2 x double>, ptr [[TMP856]], align 8
-; CHECK-NEXT:    [[VEC_GEP1185:%.*]] = getelementptr double, ptr [[TMP856]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1186:%.*]] = load <2 x double>, ptr [[VEC_GEP1185]], align 8
-; CHECK-NEXT:    [[BLOCK1187:%.*]] = shufflevector <2 x double> [[COL_LOAD1181]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP857:%.*]] = extractelement <2 x double> [[COL_LOAD1184]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1188:%.*]] = insertelement <2 x double> poison, double [[TMP857]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1189:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1188]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP858:%.*]] = fmul contract <2 x double> [[BLOCK1187]], [[SPLAT_SPLAT1189]]
-; CHECK-NEXT:    [[BLOCK1190:%.*]] = shufflevector <2 x double> [[COL_LOAD1183]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP859:%.*]] = extractelement <2 x double> [[COL_LOAD1184]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1191:%.*]] = insertelement <2 x double> poison, double [[TMP859]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1192:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1191]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP860:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1190]], <2 x double> [[SPLAT_SPLAT1192]], <2 x double> [[TMP858]])
-; CHECK-NEXT:    [[TMP861:%.*]] = shufflevector <2 x double> [[TMP860]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP862:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP861]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1193:%.*]] = shufflevector <2 x double> [[COL_LOAD1181]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP863:%.*]] = extractelement <2 x double> [[COL_LOAD1186]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1194:%.*]] = insertelement <2 x double> poison, double [[TMP863]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1195:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1194]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP864:%.*]] = fmul contract <2 x double> [[BLOCK1193]], [[SPLAT_SPLAT1195]]
-; CHECK-NEXT:    [[BLOCK1196:%.*]] = shufflevector <2 x double> [[COL_LOAD1183]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP865:%.*]] = extractelement <2 x double> [[COL_LOAD1186]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1197:%.*]] = insertelement <2 x double> poison, double [[TMP865]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1198:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1197]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP866:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1196]], <2 x double> [[SPLAT_SPLAT1198]], <2 x double> [[TMP864]])
-; CHECK-NEXT:    [[TMP867:%.*]] = shufflevector <2 x double> [[TMP866]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP868:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP867]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP869:%.*]] = getelementptr double, ptr [[A]], i64 22
-; CHECK-NEXT:    [[COL_LOAD1199:%.*]] = load <2 x double>, ptr [[TMP869]], align 8
-; CHECK-NEXT:    [[VEC_GEP1200:%.*]] = getelementptr double, ptr [[TMP869]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1201:%.*]] = load <2 x double>, ptr [[VEC_GEP1200]], align 8
-; CHECK-NEXT:    [[TMP870:%.*]] = getelementptr double, ptr [[B]], i64 50
-; CHECK-NEXT:    [[COL_LOAD1202:%.*]] = load <2 x double>, ptr [[TMP870]], align 8
-; CHECK-NEXT:    [[VEC_GEP1203:%.*]] = getelementptr double, ptr [[TMP870]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1204:%.*]] = load <2 x double>, ptr [[VEC_GEP1203]], align 8
-; CHECK-NEXT:    [[BLOCK1205:%.*]] = shufflevector <2 x double> [[TMP862]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1206:%.*]] = shufflevector <2 x double> [[COL_LOAD1199]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP871:%.*]] = extractelement <2 x double> [[COL_LOAD1202]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1207:%.*]] = insertelement <2 x double> poison, double [[TMP871]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1208:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1207]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP872:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1206]], <2 x double> [[SPLAT_SPLAT1208]], <2 x double> [[BLOCK1205]])
-; CHECK-NEXT:    [[BLOCK1209:%.*]] = shufflevector <2 x double> [[COL_LOAD1201]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP873:%.*]] = extractelement <2 x double> [[COL_LOAD1202]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1210:%.*]] = insertelement <2 x double> poison, double [[TMP873]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1211:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1210]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP874:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1209]], <2 x double> [[SPLAT_SPLAT1211]], <2 x double> [[TMP872]])
-; CHECK-NEXT:    [[TMP875:%.*]] = shufflevector <2 x double> [[TMP874]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP876:%.*]] = shufflevector <2 x double> [[TMP862]], <2 x double> [[TMP875]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1212:%.*]] = shufflevector <2 x double> [[TMP868]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1213:%.*]] = shufflevector <2 x double> [[COL_LOAD1199]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP877:%.*]] = extractelement <2 x double> [[COL_LOAD1204]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1214:%.*]] = insertelement <2 x double> poison, double [[TMP877]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1215:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1214]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP878:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1213]], <2 x double> [[SPLAT_SPLAT1215]], <2 x double> [[BLOCK1212]])
-; CHECK-NEXT:    [[BLOCK1216:%.*]] = shufflevector <2 x double> [[COL_LOAD1201]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP879:%.*]] = extractelement <2 x double> [[COL_LOAD1204]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1217:%.*]] = insertelement <2 x double> poison, double [[TMP879]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1218:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1217]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP880:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1216]], <2 x double> [[SPLAT_SPLAT1218]], <2 x double> [[TMP878]])
-; CHECK-NEXT:    [[TMP881:%.*]] = shufflevector <2 x double> [[TMP880]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP882:%.*]] = shufflevector <2 x double> [[TMP868]], <2 x double> [[TMP881]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP883:%.*]] = getelementptr double, ptr [[A]], i64 38
-; CHECK-NEXT:    [[COL_LOAD1219:%.*]] = load <2 x double>, ptr [[TMP883]], align 8
-; CHECK-NEXT:    [[VEC_GEP1220:%.*]] = getelementptr double, ptr [[TMP883]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1221:%.*]] = load <2 x double>, ptr [[VEC_GEP1220]], align 8
-; CHECK-NEXT:    [[TMP884:%.*]] = getelementptr double, ptr [[B]], i64 52
-; CHECK-NEXT:    [[COL_LOAD1222:%.*]] = load <2 x double>, ptr [[TMP884]], align 8
-; CHECK-NEXT:    [[VEC_GEP1223:%.*]] = getelementptr double, ptr [[TMP884]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1224:%.*]] = load <2 x double>, ptr [[VEC_GEP1223]], align 8
-; CHECK-NEXT:    [[BLOCK1225:%.*]] = shufflevector <2 x double> [[TMP876]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1226:%.*]] = shufflevector <2 x double> [[COL_LOAD1219]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP885:%.*]] = extractelement <2 x double> [[COL_LOAD1222]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1227:%.*]] = insertelement <2 x double> poison, double [[TMP885]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1228:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1227]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP886:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1226]], <2 x double> [[SPLAT_SPLAT1228]], <2 x double> [[BLOCK1225]])
-; CHECK-NEXT:    [[BLOCK1229:%.*]] = shufflevector <2 x double> [[COL_LOAD1221]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP887:%.*]] = extractelement <2 x double> [[COL_LOAD1222]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1230:%.*]] = insertelement <2 x double> poison, double [[TMP887]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1231:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1230]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP888:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1229]], <2 x double> [[SPLAT_SPLAT1231]], <2 x double> [[TMP886]])
-; CHECK-NEXT:    [[TMP889:%.*]] = shufflevector <2 x double> [[TMP888]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP890:%.*]] = shufflevector <2 x double> [[TMP876]], <2 x double> [[TMP889]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1232:%.*]] = shufflevector <2 x double> [[TMP882]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1233:%.*]] = shufflevector <2 x double> [[COL_LOAD1219]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP891:%.*]] = extractelement <2 x double> [[COL_LOAD1224]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1234:%.*]] = insertelement <2 x double> poison, double [[TMP891]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1235:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1234]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP892:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1233]], <2 x double> [[SPLAT_SPLAT1235]], <2 x double> [[BLOCK1232]])
-; CHECK-NEXT:    [[BLOCK1236:%.*]] = shufflevector <2 x double> [[COL_LOAD1221]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP893:%.*]] = extractelement <2 x double> [[COL_LOAD1224]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1237:%.*]] = insertelement <2 x double> poison, double [[TMP893]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1238:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1237]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP894:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1236]], <2 x double> [[SPLAT_SPLAT1238]], <2 x double> [[TMP892]])
-; CHECK-NEXT:    [[TMP895:%.*]] = shufflevector <2 x double> [[TMP894]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP896:%.*]] = shufflevector <2 x double> [[TMP882]], <2 x double> [[TMP895]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP897:%.*]] = getelementptr double, ptr [[A]], i64 54
-; CHECK-NEXT:    [[COL_LOAD1239:%.*]] = load <2 x double>, ptr [[TMP897]], align 8
-; CHECK-NEXT:    [[VEC_GEP1240:%.*]] = getelementptr double, ptr [[TMP897]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1241:%.*]] = load <2 x double>, ptr [[VEC_GEP1240]], align 8
-; CHECK-NEXT:    [[TMP898:%.*]] = getelementptr double, ptr [[B]], i64 54
-; CHECK-NEXT:    [[COL_LOAD1242:%.*]] = load <2 x double>, ptr [[TMP898]], align 8
-; CHECK-NEXT:    [[VEC_GEP1243:%.*]] = getelementptr double, ptr [[TMP898]], i64 8
-; CHECK-NEXT:    [[COL_LOAD1244:%.*]] = load <2 x double>, ptr [[VEC_GEP1243]], align 8
-; CHECK-NEXT:    [[BLOCK1245:%.*]] = shufflevector <2 x double> [[TMP890]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1246:%.*]] = shufflevector <2 x double> [[COL_LOAD1239]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP899:%.*]] = extractelement <2 x double> [[COL_LOAD1242]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1247:%.*]] = insertelement <2 x double> poison, double [[TMP899]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1248:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1247]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP900:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1246]], <2 x double> [[SPLAT_SPLAT1248]], <2 x double> [[BLOCK1245]])
-; CHECK-NEXT:    [[BLOCK1249:%.*]] = shufflevector <2 x double> [[COL_LOAD1241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP901:%.*]] = extractelement <2 x double> [[COL_LOAD1242]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1250:%.*]] = insertelement <2 x double> poison, double [[TMP901]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1251:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1250]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP902:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1249]], <2 x double> [[SPLAT_SPLAT1251]], <2 x double> [[TMP900]])
-; CHECK-NEXT:    [[TMP903:%.*]] = shufflevector <2 x double> [[TMP902]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP904:%.*]] = shufflevector <2 x double> [[TMP890]], <2 x double> [[TMP903]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[BLOCK1252:%.*]] = shufflevector <2 x double> [[TMP896]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[BLOCK1253:%.*]] = shufflevector <2 x double> [[COL_LOAD1239]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP905:%.*]] = extractelement <2 x double> [[COL_LOAD1244]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1254:%.*]] = insertelement <2 x double> poison, double [[TMP905]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1255:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1254]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP906:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1253]], <2 x double> [[SPLAT_SPLAT1255]], <2 x double> [[BLOCK1252]])
-; CHECK-NEXT:    [[BLOCK1256:%.*]] = shufflevector <2 x double> [[COL_LOAD1241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP907:%.*]] = extractelement <2 x double> [[COL_LOAD1244]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT1257:%.*]] = insertelement <2 x double> poison, double [[TMP907]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT1258:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1257]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP908:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1256]], <2 x double> [[SPLAT_SPLAT1258]], <2 x double> [[TMP906]])
-; CHECK-NEXT:    [[TMP909:%.*]] = shufflevector <2 x double> [[TMP908]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP910:%.*]] = shufflevector <2 x double> [[TMP896]], <2 x double> [[TMP909]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP911:%.*]] = getelementptr double, ptr [[C]], i64 54
-; CHECK-NEXT:    store <2 x double> [[TMP904]], ptr [[TMP911]], align 8
-; CHECK-NEXT:    [[VEC_GEP1259:%.*]] = getelementptr double, ptr [[TMP911]], i64 8
-; CHECK-NEXT:    store <2 x double> [[TMP910]], ptr [[VEC_GEP1259]], align 8
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[COLS_HEADER:.*]]
+; CHECK:       [[COLS_HEADER]]:
+; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[COLS_STEP:%.*]], %[[COLS_LATCH:.*]] ]
+; CHECK-NEXT:    br label %[[COLS_BODY:.*]]
+; CHECK:       [[COLS_BODY]]:
+; CHECK-NEXT:    br label %[[ROWS_HEADER:.*]]
+; CHECK:       [[ROWS_HEADER]]:
+; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, %[[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], %[[ROWS_LATCH:.*]] ]
+; CHECK-NEXT:    br label %[[ROWS_BODY:.*]]
+; CHECK:       [[ROWS_BODY]]:
+; CHECK-NEXT:    br label %[[INNER_HEADER:.*]]
+; CHECK:       [[INNER_HEADER]]:
+; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, %[[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], %[[INNER_LATCH:.*]] ]
+; CHECK-NEXT:    [[TMP912:%.*]] = phi <2 x double> [ zeroinitializer, %[[ROWS_BODY]] ], [ [[TMP921:%.*]], %[[INNER_LATCH]] ]
+; CHECK-NEXT:    [[TMP913:%.*]] = phi <2 x double> [ zeroinitializer, %[[ROWS_BODY]] ], [ [[TMP927:%.*]], %[[INNER_LATCH]] ]
+; CHECK-NEXT:    br label %[[INNER_BODY:.*]]
+; CHECK:       [[INNER_BODY]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INNER_IV]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], [[ROWS_IV]]
+; CHECK-NEXT:    [[TMP914:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[COL_LOAD1240:%.*]] = load <2 x double>, ptr [[TMP914]], align 8
+; CHECK-NEXT:    [[VEC_GEP1241:%.*]] = getelementptr double, ptr [[TMP914]], i64 8
+; CHECK-NEXT:    [[COL_LOAD1243:%.*]] = load <2 x double>, ptr [[VEC_GEP1241]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[COLS_IV]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[INNER_IV]]
+; CHECK-NEXT:    [[TMP915:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP4]]
+; CHECK-NEXT:    [[COL_LOAD1245:%.*]] = load <2 x double>, ptr [[TMP915]], align 8
+; CHECK-NEXT:    [[VEC_GEP1244:%.*]] = getelementptr double, ptr [[TMP915]], i64 8
+; CHECK-NEXT:    [[COL_LOAD1246:%.*]] = load <2 x double>, ptr [[VEC_GEP1244]], align 8
+; CHECK-NEXT:    [[BLOCK1247:%.*]] = shufflevector <2 x double> [[TMP912]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[BLOCK1248:%.*]] = shufflevector <2 x double> [[COL_LOAD1240]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP916:%.*]] = extractelement <2 x double> [[COL_LOAD1245]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT1248:%.*]] = insertelement <2 x double> poison, double [[TMP916]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT1249:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1248]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP917:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1248]], <2 x double> [[SPLAT_SPLAT1249]], <2 x double> [[BLOCK1247]])
+; CHECK-NEXT:    [[BLOCK1250:%.*]] = shufflevector <2 x double> [[COL_LOAD1243]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP918:%.*]] = extractelement <2 x double> [[COL_LOAD1245]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT1251:%.*]] = insertelement <2 x double> poison, double [[TMP918]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT1252:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1251]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP919:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1250]], <2 x double> [[SPLAT_SPLAT1252]], <2 x double> [[TMP917]])
+; CHECK-NEXT:    [[TMP920:%.*]] = shufflevector <2 x double> [[TMP919]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP921]] = shufflevector <2 x double> [[TMP912]], <2 x double> [[TMP920]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[BLOCK1254:%.*]] = shufflevector <2 x double> [[TMP913]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[BLOCK1255:%.*]] = shufflevector <2 x double> [[COL_LOAD1240]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP922:%.*]] = extractelement <2 x double> [[COL_LOAD1246]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT1255:%.*]] = insertelement <2 x double> poison, double [[TMP922]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT1256:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1255]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP923:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1255]], <2 x double> [[SPLAT_SPLAT1256]], <2 x double> [[BLOCK1254]])
+; CHECK-NEXT:    [[BLOCK1257:%.*]] = shufflevector <2 x double> [[COL_LOAD1243]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP924:%.*]] = extractelement <2 x double> [[COL_LOAD1246]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT1258:%.*]] = insertelement <2 x double> poison, double [[TMP924]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT1259:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1258]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP925:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1257]], <2 x double> [[SPLAT_SPLAT1259]], <2 x double> [[TMP923]])
+; CHECK-NEXT:    [[TMP926:%.*]] = shufflevector <2 x double> [[TMP925]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP927]] = shufflevector <2 x double> [[TMP913]], <2 x double> [[TMP926]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    br label %[[INNER_LATCH]]
+; CHECK:       [[INNER_LATCH]]:
+; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
+; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 8
+; CHECK-NEXT:    br i1 [[INNER_COND]], label %[[INNER_HEADER]], label %[[ROWS_LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[ROWS_LATCH]]:
+; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
+; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[COLS_IV]], 8
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], [[ROWS_IV]]
+; CHECK-NEXT:    [[TMP928:%.*]] = getelementptr double, ptr [[C]], i64 [[TMP19]]
+; CHECK-NEXT:    store <2 x double> [[TMP921]], ptr [[TMP928]], align 8
+; CHECK-NEXT:    [[VEC_GEP1260:%.*]] = getelementptr double, ptr [[TMP928]], i64 8
+; CHECK-NEXT:    store <2 x double> [[TMP927]], ptr [[VEC_GEP1260]], align 8
+; CHECK-NEXT:    br i1 [[ROWS_COND]], label %[[ROWS_HEADER]], label %[[COLS_LATCH]]
+; CHECK:       [[COLS_LATCH]]:
+; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
+; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 8
+; CHECK-NEXT:    br i1 [[COLS_COND]], label %[[COLS_HEADER]], label %[[CONTINUE:.*]]
+; CHECK:       [[CONTINUE]]:
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -3177,3 +1044,7 @@ entry:
   store <64 x double> %c, ptr %C, align 8
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.count", i32 4}
+;.

diff  --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
index 8c2cc8e799bcd..77da175b7478b 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-loops-threshold=0 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
 
 ; REQUIRES: aarch64-registered-target
 

diff  --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
index fb1925d48bb96..0852940cd226e 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics -fuse-matrix-use-loops -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics -fuse-matrix-loops-threshold=0 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
 
 ; REQUIRES: aarch64-registered-target
 

diff  --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
index 155f7755c2095..430358f0a5138 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-loops-threshold=9999 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
 
 ; REQUIRES: aarch64-registered-target
 


        


More information about the llvm-commits mailing list