[llvm] 54177e9 - [Matrix] Use tiled loops automatically for large kernels. (#179325)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 11 07:36:40 PST 2026
Author: Florian Hahn
Date: 2026-02-11T15:36:34Z
New Revision: 54177e95d1956aa040503d683762d4bfa3ff954b
URL: https://github.com/llvm/llvm-project/commit/54177e95d1956aa040503d683762d4bfa3ff954b
DIFF: https://github.com/llvm/llvm-project/commit/54177e95d1956aa040503d683762d4bfa3ff954b.diff
LOG: [Matrix] Use tiled loops automatically for large kernels. (#179325)
Update LowerMatrixIntrinsics to use tiled loops automatically for
larger matrixes. The fully unrolled codegen creates a huge amount of
code, which performs noticeably worse than the tiled loop nest variant.
We now try to estimate the number of instructions needed for the
multiply, and if it is too large, tiled loops are used. The current
threshold is anything roughly larger than 6x6x6 double multiply.
Eventually I think we want to only generate tiled loops. This patch is a
first step, trying to opt in for cases where we know it is beneficial.
Checked on AArch64, but should help on other architectures similarly,
and also drastically reduce binary size + compile time.
PR: https://github.com/llvm/llvm-project/pull/179325
Added:
Modified:
llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 5d558c2f7a341..ecf295dc75c3a 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -68,9 +68,10 @@ static cl::opt<unsigned> TileSize(
"fuse-matrix-tile-size", cl::init(4), cl::Hidden,
cl::desc(
"Tile size for matrix instruction fusion using square-shaped tiles."));
-static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),
- cl::Hidden,
- cl::desc("Generate loop nest for tiling."));
+static cl::opt<unsigned>
+ TileLoopsThreshold("fuse-matrix-loops-threshold", cl::init(200), cl::Hidden,
+ cl::desc("Generate loop nests for tiling when expected "
+ "number of operations exceeds threshold."));
static cl::opt<bool> ForceFusion(
"force-fuse-matrix", cl::init(false), cl::Hidden,
cl::desc("Force matrix instruction fusion even if not profitable."));
@@ -612,6 +613,24 @@ class LowerMatrixIntrinsics {
.getFixedValue()));
}
+ /// Estimate the number of native vector operations for a multiply of matrices
+ /// with dimensions \p R x \p M and \p M x \p C. Native ops are computed as
+ /// ceil(ElementCount * ElementBits / RegisterBits).
+ ///
+ /// Native vector ops per operation type (VF = native vector elements):
+ /// FMAs: C * ceil(R/VF) * M (one FMA per VF output elements)
+ /// A loads: ceil(R/VF) * M (A has M columns, ceil(R/VF) native loads each)
+ /// B loads: ceil(M/VF) * C (B has C columns, ceil(M/VF) native loads each)
+ /// Stores: C * ceil(R/VF) (one store per VF output elements)
+ unsigned getNumNativeVectorOps(Type *EltType, unsigned R, unsigned M,
+ unsigned C) {
+ unsigned NumFMAs = C * getNumOps(EltType, R) * M; // getNumOps(EltType, R) is used as the ceil(R/VF) term; its definition is outside this hunk.
+ unsigned NumALoads = getNumOps(EltType, R) * M; // "A loads" term of the header formula.
+ unsigned NumBLoads = getNumOps(EltType, M) * C; // "B loads" term of the header formula.
+ unsigned NumStores = getNumOps(EltType, R) * C; // "Stores" term of the header formula.
+ return NumFMAs + NumALoads + NumBLoads + NumStores; // Total estimate is the sum of all four counts. NOTE(review): plain unsigned arithmetic; presumably dimensions are small enough not to wrap - confirm.
+ }
+
/// Return the set of vectors that a matrix value is lowered to.
///
/// If we lowered \p MatrixVal, just return the cache result matrix. Otherwise
@@ -2057,7 +2076,12 @@ class LowerMatrixIntrinsics {
Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
Value *CPtr = Store->getPointerOperand();
- if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
+ // Use loop-based tiling when the number of expected operations exceeds
+ // threshold.
+ unsigned NumOps = getNumNativeVectorOps(EltType, R, M, C);
+ bool UseLoops =
+ (NumOps > TileLoopsThreshold) && R % TileSize == 0 && C % TileSize == 0;
+ if (UseLoops)
createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store);
else {
IRBuilder<> Builder(Store);
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
index abd1d96937b28..12a833bca521d 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-loops-threshold=9999 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-loops-threshold=9999 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
; REQUIRES: aarch64-registered-target
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
index a6308c5a97333..ee42b10a03fbc 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops=false -fuse-matrix-tile-size=1 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-loops-threshold=999999 -fuse-matrix-tile-size=1 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
; REQUIRES: aarch64-registered-target
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
index ae10f20dd6e8a..868dfec92209a 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
@@ -959,2215 +959,82 @@ entry:
define void @multiply_8x8x8(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
; CHECK-LABEL: define void @multiply_8x8x8(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 8
-; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 8
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
-; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <2 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT6]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK5]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT9]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <2 x double> [[BLOCK8]], [[SPLAT_SPLAT10]]
-; CHECK-NEXT: [[BLOCK11:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK11]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP9]])
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP12]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[A]], i64 16
-; CHECK-NEXT: [[COL_LOAD14:%.*]] = load <2 x double>, ptr [[TMP14]], align 8
-; CHECK-NEXT: [[VEC_GEP15:%.*]] = getelementptr double, ptr [[TMP14]], i64 8
-; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, ptr [[VEC_GEP15]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[B]], i64 2
-; CHECK-NEXT: [[COL_LOAD17:%.*]] = load <2 x double>, ptr [[TMP15]], align 8
-; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr double, ptr [[TMP15]], i64 8
-; CHECK-NEXT: [[COL_LOAD19:%.*]] = load <2 x double>, ptr [[VEC_GEP18]], align 8
-; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[COL_LOAD14]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD17]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT22:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT22]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK21]], <2 x double> [[SPLAT_SPLAT23]], <2 x double> [[BLOCK20]])
-; CHECK-NEXT: [[BLOCK24:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD17]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT25:%.*]] = insertelement <2 x double> poison, double [[TMP18]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT25]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK24]], <2 x double> [[SPLAT_SPLAT26]], <2 x double> [[TMP17]])
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP20]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK27:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK28:%.*]] = shufflevector <2 x double> [[COL_LOAD14]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD19]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT29:%.*]] = insertelement <2 x double> poison, double [[TMP22]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT29]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK28]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[BLOCK27]])
-; CHECK-NEXT: [[BLOCK31:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[COL_LOAD19]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT32:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT32]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK31]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP23]])
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x double> [[TMP25]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP26]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[A]], i64 32
-; CHECK-NEXT: [[COL_LOAD34:%.*]] = load <2 x double>, ptr [[TMP28]], align 8
-; CHECK-NEXT: [[VEC_GEP35:%.*]] = getelementptr double, ptr [[TMP28]], i64 8
-; CHECK-NEXT: [[COL_LOAD36:%.*]] = load <2 x double>, ptr [[VEC_GEP35]], align 8
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[B]], i64 4
-; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[TMP29]], align 8
-; CHECK-NEXT: [[VEC_GEP38:%.*]] = getelementptr double, ptr [[TMP29]], i64 8
-; CHECK-NEXT: [[COL_LOAD39:%.*]] = load <2 x double>, ptr [[VEC_GEP38]], align 8
-; CHECK-NEXT: [[BLOCK40:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK41:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x double> [[COL_LOAD37]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT42:%.*]] = insertelement <2 x double> poison, double [[TMP30]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT42]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK41]], <2 x double> [[SPLAT_SPLAT43]], <2 x double> [[BLOCK40]])
-; CHECK-NEXT: [[BLOCK44:%.*]] = shufflevector <2 x double> [[COL_LOAD36]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[COL_LOAD37]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT45:%.*]] = insertelement <2 x double> poison, double [[TMP32]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT45]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP33:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK44]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP31]])
-; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <2 x double> [[TMP33]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> [[TMP34]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK47:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[COL_LOAD39]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <2 x double> poison, double [[TMP36]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT49]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK48]], <2 x double> [[SPLAT_SPLAT50]], <2 x double> [[BLOCK47]])
-; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <2 x double> [[COL_LOAD36]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[COL_LOAD39]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <2 x double> poison, double [[TMP38]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT52]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK51]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP37]])
-; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x double> [[TMP39]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> [[TMP40]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[A]], i64 48
-; CHECK-NEXT: [[COL_LOAD54:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
-; CHECK-NEXT: [[VEC_GEP55:%.*]] = getelementptr double, ptr [[TMP42]], i64 8
-; CHECK-NEXT: [[COL_LOAD56:%.*]] = load <2 x double>, ptr [[VEC_GEP55]], align 8
-; CHECK-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[B]], i64 6
-; CHECK-NEXT: [[COL_LOAD57:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
-; CHECK-NEXT: [[VEC_GEP58:%.*]] = getelementptr double, ptr [[TMP43]], i64 8
-; CHECK-NEXT: [[COL_LOAD59:%.*]] = load <2 x double>, ptr [[VEC_GEP58]], align 8
-; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <2 x double> [[TMP35]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK61:%.*]] = shufflevector <2 x double> [[COL_LOAD54]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x double> [[COL_LOAD57]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT62:%.*]] = insertelement <2 x double> poison, double [[TMP44]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT63:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT62]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK61]], <2 x double> [[SPLAT_SPLAT63]], <2 x double> [[BLOCK60]])
-; CHECK-NEXT: [[BLOCK64:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x double> [[COL_LOAD57]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT65:%.*]] = insertelement <2 x double> poison, double [[TMP46]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT66:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT65]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK64]], <2 x double> [[SPLAT_SPLAT66]], <2 x double> [[TMP45]])
-; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x double> [[TMP47]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <2 x double> [[TMP35]], <2 x double> [[TMP48]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK67:%.*]] = shufflevector <2 x double> [[TMP41]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK68:%.*]] = shufflevector <2 x double> [[COL_LOAD54]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x double> [[COL_LOAD59]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT69:%.*]] = insertelement <2 x double> poison, double [[TMP50]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT70:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT69]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK68]], <2 x double> [[SPLAT_SPLAT70]], <2 x double> [[BLOCK67]])
-; CHECK-NEXT: [[BLOCK71:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x double> [[COL_LOAD59]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT72:%.*]] = insertelement <2 x double> poison, double [[TMP52]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT73:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT72]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK71]], <2 x double> [[SPLAT_SPLAT73]], <2 x double> [[TMP51]])
-; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <2 x double> [[TMP53]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <2 x double> [[TMP41]], <2 x double> [[TMP54]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP56:%.*]] = getelementptr double, ptr [[C]], i64 0
-; CHECK-NEXT: store <2 x double> [[TMP49]], ptr [[TMP56]], align 8
-; CHECK-NEXT: [[VEC_GEP74:%.*]] = getelementptr double, ptr [[TMP56]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP55]], ptr [[VEC_GEP74]], align 8
-; CHECK-NEXT: [[TMP57:%.*]] = getelementptr double, ptr [[A]], i64 2
-; CHECK-NEXT: [[COL_LOAD75:%.*]] = load <2 x double>, ptr [[TMP57]], align 8
-; CHECK-NEXT: [[VEC_GEP76:%.*]] = getelementptr double, ptr [[TMP57]], i64 8
-; CHECK-NEXT: [[COL_LOAD77:%.*]] = load <2 x double>, ptr [[VEC_GEP76]], align 8
-; CHECK-NEXT: [[TMP58:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT: [[COL_LOAD78:%.*]] = load <2 x double>, ptr [[TMP58]], align 8
-; CHECK-NEXT: [[VEC_GEP79:%.*]] = getelementptr double, ptr [[TMP58]], i64 8
-; CHECK-NEXT: [[COL_LOAD80:%.*]] = load <2 x double>, ptr [[VEC_GEP79]], align 8
-; CHECK-NEXT: [[BLOCK81:%.*]] = shufflevector <2 x double> [[COL_LOAD75]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP59:%.*]] = extractelement <2 x double> [[COL_LOAD78]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT82:%.*]] = insertelement <2 x double> poison, double [[TMP59]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT83:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT82]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP60:%.*]] = fmul contract <2 x double> [[BLOCK81]], [[SPLAT_SPLAT83]]
-; CHECK-NEXT: [[BLOCK84:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP61:%.*]] = extractelement <2 x double> [[COL_LOAD78]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT85:%.*]] = insertelement <2 x double> poison, double [[TMP61]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT86:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT85]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP62:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK84]], <2 x double> [[SPLAT_SPLAT86]], <2 x double> [[TMP60]])
-; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <2 x double> [[TMP62]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP63]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK87:%.*]] = shufflevector <2 x double> [[COL_LOAD75]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP65:%.*]] = extractelement <2 x double> [[COL_LOAD80]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT88:%.*]] = insertelement <2 x double> poison, double [[TMP65]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT88]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP66:%.*]] = fmul contract <2 x double> [[BLOCK87]], [[SPLAT_SPLAT89]]
-; CHECK-NEXT: [[BLOCK90:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP67:%.*]] = extractelement <2 x double> [[COL_LOAD80]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT91:%.*]] = insertelement <2 x double> poison, double [[TMP67]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT91]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP68:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK90]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP66]])
-; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <2 x double> [[TMP68]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP69]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr double, ptr [[A]], i64 18
-; CHECK-NEXT: [[COL_LOAD93:%.*]] = load <2 x double>, ptr [[TMP71]], align 8
-; CHECK-NEXT: [[VEC_GEP94:%.*]] = getelementptr double, ptr [[TMP71]], i64 8
-; CHECK-NEXT: [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[VEC_GEP94]], align 8
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr double, ptr [[B]], i64 2
-; CHECK-NEXT: [[COL_LOAD96:%.*]] = load <2 x double>, ptr [[TMP72]], align 8
-; CHECK-NEXT: [[VEC_GEP97:%.*]] = getelementptr double, ptr [[TMP72]], i64 8
-; CHECK-NEXT: [[COL_LOAD98:%.*]] = load <2 x double>, ptr [[VEC_GEP97]], align 8
-; CHECK-NEXT: [[BLOCK99:%.*]] = shufflevector <2 x double> [[TMP64]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK100:%.*]] = shufflevector <2 x double> [[COL_LOAD93]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x double> [[COL_LOAD96]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT101:%.*]] = insertelement <2 x double> poison, double [[TMP73]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT102:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT101]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP74:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK100]], <2 x double> [[SPLAT_SPLAT102]], <2 x double> [[BLOCK99]])
-; CHECK-NEXT: [[BLOCK103:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x double> [[COL_LOAD96]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT104:%.*]] = insertelement <2 x double> poison, double [[TMP75]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT105:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT104]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP76:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK103]], <2 x double> [[SPLAT_SPLAT105]], <2 x double> [[TMP74]])
-; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <2 x double> [[TMP76]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP78:%.*]] = shufflevector <2 x double> [[TMP64]], <2 x double> [[TMP77]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK106:%.*]] = shufflevector <2 x double> [[TMP70]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK107:%.*]] = shufflevector <2 x double> [[COL_LOAD93]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP79:%.*]] = extractelement <2 x double> [[COL_LOAD98]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT108:%.*]] = insertelement <2 x double> poison, double [[TMP79]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT109:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT108]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP80:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK107]], <2 x double> [[SPLAT_SPLAT109]], <2 x double> [[BLOCK106]])
-; CHECK-NEXT: [[BLOCK110:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x double> [[COL_LOAD98]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT111:%.*]] = insertelement <2 x double> poison, double [[TMP81]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT112:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT111]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP82:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK110]], <2 x double> [[SPLAT_SPLAT112]], <2 x double> [[TMP80]])
-; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <2 x double> [[TMP82]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x double> [[TMP70]], <2 x double> [[TMP83]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP85:%.*]] = getelementptr double, ptr [[A]], i64 34
-; CHECK-NEXT: [[COL_LOAD113:%.*]] = load <2 x double>, ptr [[TMP85]], align 8
-; CHECK-NEXT: [[VEC_GEP114:%.*]] = getelementptr double, ptr [[TMP85]], i64 8
-; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, ptr [[VEC_GEP114]], align 8
-; CHECK-NEXT: [[TMP86:%.*]] = getelementptr double, ptr [[B]], i64 4
-; CHECK-NEXT: [[COL_LOAD116:%.*]] = load <2 x double>, ptr [[TMP86]], align 8
-; CHECK-NEXT: [[VEC_GEP117:%.*]] = getelementptr double, ptr [[TMP86]], i64 8
-; CHECK-NEXT: [[COL_LOAD118:%.*]] = load <2 x double>, ptr [[VEC_GEP117]], align 8
-; CHECK-NEXT: [[BLOCK119:%.*]] = shufflevector <2 x double> [[TMP78]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK120:%.*]] = shufflevector <2 x double> [[COL_LOAD113]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP87:%.*]] = extractelement <2 x double> [[COL_LOAD116]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT121:%.*]] = insertelement <2 x double> poison, double [[TMP87]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT122:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT121]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP88:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK120]], <2 x double> [[SPLAT_SPLAT122]], <2 x double> [[BLOCK119]])
-; CHECK-NEXT: [[BLOCK123:%.*]] = shufflevector <2 x double> [[COL_LOAD115]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x double> [[COL_LOAD116]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT124:%.*]] = insertelement <2 x double> poison, double [[TMP89]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT125:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT124]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP90:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK123]], <2 x double> [[SPLAT_SPLAT125]], <2 x double> [[TMP88]])
-; CHECK-NEXT: [[TMP91:%.*]] = shufflevector <2 x double> [[TMP90]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP92:%.*]] = shufflevector <2 x double> [[TMP78]], <2 x double> [[TMP91]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK126:%.*]] = shufflevector <2 x double> [[TMP84]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK127:%.*]] = shufflevector <2 x double> [[COL_LOAD113]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x double> [[COL_LOAD118]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT128:%.*]] = insertelement <2 x double> poison, double [[TMP93]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT129:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT128]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP94:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK127]], <2 x double> [[SPLAT_SPLAT129]], <2 x double> [[BLOCK126]])
-; CHECK-NEXT: [[BLOCK130:%.*]] = shufflevector <2 x double> [[COL_LOAD115]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x double> [[COL_LOAD118]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT131:%.*]] = insertelement <2 x double> poison, double [[TMP95]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT132:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT131]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP96:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK130]], <2 x double> [[SPLAT_SPLAT132]], <2 x double> [[TMP94]])
-; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <2 x double> [[TMP96]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <2 x double> [[TMP84]], <2 x double> [[TMP97]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP99:%.*]] = getelementptr double, ptr [[A]], i64 50
-; CHECK-NEXT: [[COL_LOAD133:%.*]] = load <2 x double>, ptr [[TMP99]], align 8
-; CHECK-NEXT: [[VEC_GEP134:%.*]] = getelementptr double, ptr [[TMP99]], i64 8
-; CHECK-NEXT: [[COL_LOAD135:%.*]] = load <2 x double>, ptr [[VEC_GEP134]], align 8
-; CHECK-NEXT: [[TMP100:%.*]] = getelementptr double, ptr [[B]], i64 6
-; CHECK-NEXT: [[COL_LOAD136:%.*]] = load <2 x double>, ptr [[TMP100]], align 8
-; CHECK-NEXT: [[VEC_GEP137:%.*]] = getelementptr double, ptr [[TMP100]], i64 8
-; CHECK-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[VEC_GEP137]], align 8
-; CHECK-NEXT: [[BLOCK139:%.*]] = shufflevector <2 x double> [[TMP92]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK140:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP101:%.*]] = extractelement <2 x double> [[COL_LOAD136]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT141:%.*]] = insertelement <2 x double> poison, double [[TMP101]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT142:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT141]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP102:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK140]], <2 x double> [[SPLAT_SPLAT142]], <2 x double> [[BLOCK139]])
-; CHECK-NEXT: [[BLOCK143:%.*]] = shufflevector <2 x double> [[COL_LOAD135]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP103:%.*]] = extractelement <2 x double> [[COL_LOAD136]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT144:%.*]] = insertelement <2 x double> poison, double [[TMP103]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT145:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT144]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP104:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK143]], <2 x double> [[SPLAT_SPLAT145]], <2 x double> [[TMP102]])
-; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <2 x double> [[TMP104]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <2 x double> [[TMP92]], <2 x double> [[TMP105]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK146:%.*]] = shufflevector <2 x double> [[TMP98]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK147:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x double> [[COL_LOAD138]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT148:%.*]] = insertelement <2 x double> poison, double [[TMP107]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT149:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT148]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP108:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK147]], <2 x double> [[SPLAT_SPLAT149]], <2 x double> [[BLOCK146]])
-; CHECK-NEXT: [[BLOCK150:%.*]] = shufflevector <2 x double> [[COL_LOAD135]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP109:%.*]] = extractelement <2 x double> [[COL_LOAD138]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT151:%.*]] = insertelement <2 x double> poison, double [[TMP109]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT152:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT151]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP110:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK150]], <2 x double> [[SPLAT_SPLAT152]], <2 x double> [[TMP108]])
-; CHECK-NEXT: [[TMP111:%.*]] = shufflevector <2 x double> [[TMP110]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP112:%.*]] = shufflevector <2 x double> [[TMP98]], <2 x double> [[TMP111]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP113:%.*]] = getelementptr double, ptr [[C]], i64 2
-; CHECK-NEXT: store <2 x double> [[TMP106]], ptr [[TMP113]], align 8
-; CHECK-NEXT: [[VEC_GEP153:%.*]] = getelementptr double, ptr [[TMP113]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP112]], ptr [[VEC_GEP153]], align 8
-; CHECK-NEXT: [[TMP114:%.*]] = getelementptr double, ptr [[A]], i64 4
-; CHECK-NEXT: [[COL_LOAD154:%.*]] = load <2 x double>, ptr [[TMP114]], align 8
-; CHECK-NEXT: [[VEC_GEP155:%.*]] = getelementptr double, ptr [[TMP114]], i64 8
-; CHECK-NEXT: [[COL_LOAD156:%.*]] = load <2 x double>, ptr [[VEC_GEP155]], align 8
-; CHECK-NEXT: [[TMP115:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT: [[COL_LOAD157:%.*]] = load <2 x double>, ptr [[TMP115]], align 8
-; CHECK-NEXT: [[VEC_GEP158:%.*]] = getelementptr double, ptr [[TMP115]], i64 8
-; CHECK-NEXT: [[COL_LOAD159:%.*]] = load <2 x double>, ptr [[VEC_GEP158]], align 8
-; CHECK-NEXT: [[BLOCK160:%.*]] = shufflevector <2 x double> [[COL_LOAD154]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP116:%.*]] = extractelement <2 x double> [[COL_LOAD157]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT161:%.*]] = insertelement <2 x double> poison, double [[TMP116]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT162:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT161]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP117:%.*]] = fmul contract <2 x double> [[BLOCK160]], [[SPLAT_SPLAT162]]
-; CHECK-NEXT: [[BLOCK163:%.*]] = shufflevector <2 x double> [[COL_LOAD156]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP118:%.*]] = extractelement <2 x double> [[COL_LOAD157]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT164:%.*]] = insertelement <2 x double> poison, double [[TMP118]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT165:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT164]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP119:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK163]], <2 x double> [[SPLAT_SPLAT165]], <2 x double> [[TMP117]])
-; CHECK-NEXT: [[TMP120:%.*]] = shufflevector <2 x double> [[TMP119]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP121:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP120]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK166:%.*]] = shufflevector <2 x double> [[COL_LOAD154]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP122:%.*]] = extractelement <2 x double> [[COL_LOAD159]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT167:%.*]] = insertelement <2 x double> poison, double [[TMP122]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT168:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT167]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP123:%.*]] = fmul contract <2 x double> [[BLOCK166]], [[SPLAT_SPLAT168]]
-; CHECK-NEXT: [[BLOCK169:%.*]] = shufflevector <2 x double> [[COL_LOAD156]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP124:%.*]] = extractelement <2 x double> [[COL_LOAD159]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT170:%.*]] = insertelement <2 x double> poison, double [[TMP124]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT171:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT170]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP125:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK169]], <2 x double> [[SPLAT_SPLAT171]], <2 x double> [[TMP123]])
-; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x double> [[TMP125]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP127:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP126]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP128:%.*]] = getelementptr double, ptr [[A]], i64 20
-; CHECK-NEXT: [[COL_LOAD172:%.*]] = load <2 x double>, ptr [[TMP128]], align 8
-; CHECK-NEXT: [[VEC_GEP173:%.*]] = getelementptr double, ptr [[TMP128]], i64 8
-; CHECK-NEXT: [[COL_LOAD174:%.*]] = load <2 x double>, ptr [[VEC_GEP173]], align 8
-; CHECK-NEXT: [[TMP129:%.*]] = getelementptr double, ptr [[B]], i64 2
-; CHECK-NEXT: [[COL_LOAD175:%.*]] = load <2 x double>, ptr [[TMP129]], align 8
-; CHECK-NEXT: [[VEC_GEP176:%.*]] = getelementptr double, ptr [[TMP129]], i64 8
-; CHECK-NEXT: [[COL_LOAD177:%.*]] = load <2 x double>, ptr [[VEC_GEP176]], align 8
-; CHECK-NEXT: [[BLOCK178:%.*]] = shufflevector <2 x double> [[TMP121]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK179:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP130:%.*]] = extractelement <2 x double> [[COL_LOAD175]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT180:%.*]] = insertelement <2 x double> poison, double [[TMP130]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT181:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT180]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP131:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK179]], <2 x double> [[SPLAT_SPLAT181]], <2 x double> [[BLOCK178]])
-; CHECK-NEXT: [[BLOCK182:%.*]] = shufflevector <2 x double> [[COL_LOAD174]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x double> [[COL_LOAD175]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT183:%.*]] = insertelement <2 x double> poison, double [[TMP132]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT184:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT183]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP133:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK182]], <2 x double> [[SPLAT_SPLAT184]], <2 x double> [[TMP131]])
-; CHECK-NEXT: [[TMP134:%.*]] = shufflevector <2 x double> [[TMP133]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP135:%.*]] = shufflevector <2 x double> [[TMP121]], <2 x double> [[TMP134]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK185:%.*]] = shufflevector <2 x double> [[TMP127]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK186:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP136:%.*]] = extractelement <2 x double> [[COL_LOAD177]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT187:%.*]] = insertelement <2 x double> poison, double [[TMP136]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT188:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT187]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP137:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK186]], <2 x double> [[SPLAT_SPLAT188]], <2 x double> [[BLOCK185]])
-; CHECK-NEXT: [[BLOCK189:%.*]] = shufflevector <2 x double> [[COL_LOAD174]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP138:%.*]] = extractelement <2 x double> [[COL_LOAD177]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT190:%.*]] = insertelement <2 x double> poison, double [[TMP138]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT191:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT190]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP139:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK189]], <2 x double> [[SPLAT_SPLAT191]], <2 x double> [[TMP137]])
-; CHECK-NEXT: [[TMP140:%.*]] = shufflevector <2 x double> [[TMP139]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP141:%.*]] = shufflevector <2 x double> [[TMP127]], <2 x double> [[TMP140]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP142:%.*]] = getelementptr double, ptr [[A]], i64 36
-; CHECK-NEXT: [[COL_LOAD192:%.*]] = load <2 x double>, ptr [[TMP142]], align 8
-; CHECK-NEXT: [[VEC_GEP193:%.*]] = getelementptr double, ptr [[TMP142]], i64 8
-; CHECK-NEXT: [[COL_LOAD194:%.*]] = load <2 x double>, ptr [[VEC_GEP193]], align 8
-; CHECK-NEXT: [[TMP143:%.*]] = getelementptr double, ptr [[B]], i64 4
-; CHECK-NEXT: [[COL_LOAD195:%.*]] = load <2 x double>, ptr [[TMP143]], align 8
-; CHECK-NEXT: [[VEC_GEP196:%.*]] = getelementptr double, ptr [[TMP143]], i64 8
-; CHECK-NEXT: [[COL_LOAD197:%.*]] = load <2 x double>, ptr [[VEC_GEP196]], align 8
-; CHECK-NEXT: [[BLOCK198:%.*]] = shufflevector <2 x double> [[TMP135]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK199:%.*]] = shufflevector <2 x double> [[COL_LOAD192]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP144:%.*]] = extractelement <2 x double> [[COL_LOAD195]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT200:%.*]] = insertelement <2 x double> poison, double [[TMP144]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT201:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT200]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP145:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK199]], <2 x double> [[SPLAT_SPLAT201]], <2 x double> [[BLOCK198]])
-; CHECK-NEXT: [[BLOCK202:%.*]] = shufflevector <2 x double> [[COL_LOAD194]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x double> [[COL_LOAD195]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT203:%.*]] = insertelement <2 x double> poison, double [[TMP146]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT204:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT203]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP147:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK202]], <2 x double> [[SPLAT_SPLAT204]], <2 x double> [[TMP145]])
-; CHECK-NEXT: [[TMP148:%.*]] = shufflevector <2 x double> [[TMP147]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP149:%.*]] = shufflevector <2 x double> [[TMP135]], <2 x double> [[TMP148]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK205:%.*]] = shufflevector <2 x double> [[TMP141]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK206:%.*]] = shufflevector <2 x double> [[COL_LOAD192]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP150:%.*]] = extractelement <2 x double> [[COL_LOAD197]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT207:%.*]] = insertelement <2 x double> poison, double [[TMP150]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT208:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT207]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP151:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK206]], <2 x double> [[SPLAT_SPLAT208]], <2 x double> [[BLOCK205]])
-; CHECK-NEXT: [[BLOCK209:%.*]] = shufflevector <2 x double> [[COL_LOAD194]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP152:%.*]] = extractelement <2 x double> [[COL_LOAD197]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT210:%.*]] = insertelement <2 x double> poison, double [[TMP152]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT211:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT210]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP153:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK209]], <2 x double> [[SPLAT_SPLAT211]], <2 x double> [[TMP151]])
-; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x double> [[TMP153]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP155:%.*]] = shufflevector <2 x double> [[TMP141]], <2 x double> [[TMP154]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP156:%.*]] = getelementptr double, ptr [[A]], i64 52
-; CHECK-NEXT: [[COL_LOAD212:%.*]] = load <2 x double>, ptr [[TMP156]], align 8
-; CHECK-NEXT: [[VEC_GEP213:%.*]] = getelementptr double, ptr [[TMP156]], i64 8
-; CHECK-NEXT: [[COL_LOAD214:%.*]] = load <2 x double>, ptr [[VEC_GEP213]], align 8
-; CHECK-NEXT: [[TMP157:%.*]] = getelementptr double, ptr [[B]], i64 6
-; CHECK-NEXT: [[COL_LOAD215:%.*]] = load <2 x double>, ptr [[TMP157]], align 8
-; CHECK-NEXT: [[VEC_GEP216:%.*]] = getelementptr double, ptr [[TMP157]], i64 8
-; CHECK-NEXT: [[COL_LOAD217:%.*]] = load <2 x double>, ptr [[VEC_GEP216]], align 8
-; CHECK-NEXT: [[BLOCK218:%.*]] = shufflevector <2 x double> [[TMP149]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK219:%.*]] = shufflevector <2 x double> [[COL_LOAD212]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP158:%.*]] = extractelement <2 x double> [[COL_LOAD215]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT220:%.*]] = insertelement <2 x double> poison, double [[TMP158]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT221:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT220]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP159:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK219]], <2 x double> [[SPLAT_SPLAT221]], <2 x double> [[BLOCK218]])
-; CHECK-NEXT: [[BLOCK222:%.*]] = shufflevector <2 x double> [[COL_LOAD214]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x double> [[COL_LOAD215]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT223:%.*]] = insertelement <2 x double> poison, double [[TMP160]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT224:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT223]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP161:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK222]], <2 x double> [[SPLAT_SPLAT224]], <2 x double> [[TMP159]])
-; CHECK-NEXT: [[TMP162:%.*]] = shufflevector <2 x double> [[TMP161]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP163:%.*]] = shufflevector <2 x double> [[TMP149]], <2 x double> [[TMP162]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK225:%.*]] = shufflevector <2 x double> [[TMP155]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK226:%.*]] = shufflevector <2 x double> [[COL_LOAD212]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP164:%.*]] = extractelement <2 x double> [[COL_LOAD217]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT227:%.*]] = insertelement <2 x double> poison, double [[TMP164]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT228:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT227]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP165:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK226]], <2 x double> [[SPLAT_SPLAT228]], <2 x double> [[BLOCK225]])
-; CHECK-NEXT: [[BLOCK229:%.*]] = shufflevector <2 x double> [[COL_LOAD214]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x double> [[COL_LOAD217]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT230:%.*]] = insertelement <2 x double> poison, double [[TMP166]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT231:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT230]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP167:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK229]], <2 x double> [[SPLAT_SPLAT231]], <2 x double> [[TMP165]])
-; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x double> [[TMP167]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP169:%.*]] = shufflevector <2 x double> [[TMP155]], <2 x double> [[TMP168]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP170:%.*]] = getelementptr double, ptr [[C]], i64 4
-; CHECK-NEXT: store <2 x double> [[TMP163]], ptr [[TMP170]], align 8
-; CHECK-NEXT: [[VEC_GEP232:%.*]] = getelementptr double, ptr [[TMP170]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP169]], ptr [[VEC_GEP232]], align 8
-; CHECK-NEXT: [[TMP171:%.*]] = getelementptr double, ptr [[A]], i64 6
-; CHECK-NEXT: [[COL_LOAD233:%.*]] = load <2 x double>, ptr [[TMP171]], align 8
-; CHECK-NEXT: [[VEC_GEP234:%.*]] = getelementptr double, ptr [[TMP171]], i64 8
-; CHECK-NEXT: [[COL_LOAD235:%.*]] = load <2 x double>, ptr [[VEC_GEP234]], align 8
-; CHECK-NEXT: [[TMP172:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT: [[COL_LOAD236:%.*]] = load <2 x double>, ptr [[TMP172]], align 8
-; CHECK-NEXT: [[VEC_GEP237:%.*]] = getelementptr double, ptr [[TMP172]], i64 8
-; CHECK-NEXT: [[COL_LOAD238:%.*]] = load <2 x double>, ptr [[VEC_GEP237]], align 8
-; CHECK-NEXT: [[BLOCK239:%.*]] = shufflevector <2 x double> [[COL_LOAD233]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP173:%.*]] = extractelement <2 x double> [[COL_LOAD236]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT240:%.*]] = insertelement <2 x double> poison, double [[TMP173]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT241:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT240]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP174:%.*]] = fmul contract <2 x double> [[BLOCK239]], [[SPLAT_SPLAT241]]
-; CHECK-NEXT: [[BLOCK242:%.*]] = shufflevector <2 x double> [[COL_LOAD235]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP175:%.*]] = extractelement <2 x double> [[COL_LOAD236]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT243:%.*]] = insertelement <2 x double> poison, double [[TMP175]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT244:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT243]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP176:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK242]], <2 x double> [[SPLAT_SPLAT244]], <2 x double> [[TMP174]])
-; CHECK-NEXT: [[TMP177:%.*]] = shufflevector <2 x double> [[TMP176]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP178:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP177]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK245:%.*]] = shufflevector <2 x double> [[COL_LOAD233]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x double> [[COL_LOAD238]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT246:%.*]] = insertelement <2 x double> poison, double [[TMP179]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT247:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT246]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP180:%.*]] = fmul contract <2 x double> [[BLOCK245]], [[SPLAT_SPLAT247]]
-; CHECK-NEXT: [[BLOCK248:%.*]] = shufflevector <2 x double> [[COL_LOAD235]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP181:%.*]] = extractelement <2 x double> [[COL_LOAD238]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT249:%.*]] = insertelement <2 x double> poison, double [[TMP181]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT250:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT249]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP182:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK248]], <2 x double> [[SPLAT_SPLAT250]], <2 x double> [[TMP180]])
-; CHECK-NEXT: [[TMP183:%.*]] = shufflevector <2 x double> [[TMP182]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP183]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP185:%.*]] = getelementptr double, ptr [[A]], i64 22
-; CHECK-NEXT: [[COL_LOAD251:%.*]] = load <2 x double>, ptr [[TMP185]], align 8
-; CHECK-NEXT: [[VEC_GEP252:%.*]] = getelementptr double, ptr [[TMP185]], i64 8
-; CHECK-NEXT: [[COL_LOAD253:%.*]] = load <2 x double>, ptr [[VEC_GEP252]], align 8
-; CHECK-NEXT: [[TMP186:%.*]] = getelementptr double, ptr [[B]], i64 2
-; CHECK-NEXT: [[COL_LOAD254:%.*]] = load <2 x double>, ptr [[TMP186]], align 8
-; CHECK-NEXT: [[VEC_GEP255:%.*]] = getelementptr double, ptr [[TMP186]], i64 8
-; CHECK-NEXT: [[COL_LOAD256:%.*]] = load <2 x double>, ptr [[VEC_GEP255]], align 8
-; CHECK-NEXT: [[BLOCK257:%.*]] = shufflevector <2 x double> [[TMP178]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK258:%.*]] = shufflevector <2 x double> [[COL_LOAD251]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP187:%.*]] = extractelement <2 x double> [[COL_LOAD254]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT259:%.*]] = insertelement <2 x double> poison, double [[TMP187]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT260:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT259]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP188:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK258]], <2 x double> [[SPLAT_SPLAT260]], <2 x double> [[BLOCK257]])
-; CHECK-NEXT: [[BLOCK261:%.*]] = shufflevector <2 x double> [[COL_LOAD253]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x double> [[COL_LOAD254]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT262:%.*]] = insertelement <2 x double> poison, double [[TMP189]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT263:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT262]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP190:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK261]], <2 x double> [[SPLAT_SPLAT263]], <2 x double> [[TMP188]])
-; CHECK-NEXT: [[TMP191:%.*]] = shufflevector <2 x double> [[TMP190]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP192:%.*]] = shufflevector <2 x double> [[TMP178]], <2 x double> [[TMP191]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK264:%.*]] = shufflevector <2 x double> [[TMP184]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK265:%.*]] = shufflevector <2 x double> [[COL_LOAD251]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP193:%.*]] = extractelement <2 x double> [[COL_LOAD256]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT266:%.*]] = insertelement <2 x double> poison, double [[TMP193]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT267:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT266]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP194:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK265]], <2 x double> [[SPLAT_SPLAT267]], <2 x double> [[BLOCK264]])
-; CHECK-NEXT: [[BLOCK268:%.*]] = shufflevector <2 x double> [[COL_LOAD253]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP195:%.*]] = extractelement <2 x double> [[COL_LOAD256]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT269:%.*]] = insertelement <2 x double> poison, double [[TMP195]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT270:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT269]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP196:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK268]], <2 x double> [[SPLAT_SPLAT270]], <2 x double> [[TMP194]])
-; CHECK-NEXT: [[TMP197:%.*]] = shufflevector <2 x double> [[TMP196]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x double> [[TMP184]], <2 x double> [[TMP197]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP199:%.*]] = getelementptr double, ptr [[A]], i64 38
-; CHECK-NEXT: [[COL_LOAD271:%.*]] = load <2 x double>, ptr [[TMP199]], align 8
-; CHECK-NEXT: [[VEC_GEP272:%.*]] = getelementptr double, ptr [[TMP199]], i64 8
-; CHECK-NEXT: [[COL_LOAD273:%.*]] = load <2 x double>, ptr [[VEC_GEP272]], align 8
-; CHECK-NEXT: [[TMP200:%.*]] = getelementptr double, ptr [[B]], i64 4
-; CHECK-NEXT: [[COL_LOAD274:%.*]] = load <2 x double>, ptr [[TMP200]], align 8
-; CHECK-NEXT: [[VEC_GEP275:%.*]] = getelementptr double, ptr [[TMP200]], i64 8
-; CHECK-NEXT: [[COL_LOAD276:%.*]] = load <2 x double>, ptr [[VEC_GEP275]], align 8
-; CHECK-NEXT: [[BLOCK277:%.*]] = shufflevector <2 x double> [[TMP192]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK278:%.*]] = shufflevector <2 x double> [[COL_LOAD271]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP201:%.*]] = extractelement <2 x double> [[COL_LOAD274]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT279:%.*]] = insertelement <2 x double> poison, double [[TMP201]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT280:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT279]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP202:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK278]], <2 x double> [[SPLAT_SPLAT280]], <2 x double> [[BLOCK277]])
-; CHECK-NEXT: [[BLOCK281:%.*]] = shufflevector <2 x double> [[COL_LOAD273]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP203:%.*]] = extractelement <2 x double> [[COL_LOAD274]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT282:%.*]] = insertelement <2 x double> poison, double [[TMP203]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT283:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT282]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP204:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK281]], <2 x double> [[SPLAT_SPLAT283]], <2 x double> [[TMP202]])
-; CHECK-NEXT: [[TMP205:%.*]] = shufflevector <2 x double> [[TMP204]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP206:%.*]] = shufflevector <2 x double> [[TMP192]], <2 x double> [[TMP205]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK284:%.*]] = shufflevector <2 x double> [[TMP198]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK285:%.*]] = shufflevector <2 x double> [[COL_LOAD271]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP207:%.*]] = extractelement <2 x double> [[COL_LOAD276]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT286:%.*]] = insertelement <2 x double> poison, double [[TMP207]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT287:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT286]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP208:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK285]], <2 x double> [[SPLAT_SPLAT287]], <2 x double> [[BLOCK284]])
-; CHECK-NEXT: [[BLOCK288:%.*]] = shufflevector <2 x double> [[COL_LOAD273]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP209:%.*]] = extractelement <2 x double> [[COL_LOAD276]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT289:%.*]] = insertelement <2 x double> poison, double [[TMP209]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT290:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT289]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP210:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK288]], <2 x double> [[SPLAT_SPLAT290]], <2 x double> [[TMP208]])
-; CHECK-NEXT: [[TMP211:%.*]] = shufflevector <2 x double> [[TMP210]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x double> [[TMP198]], <2 x double> [[TMP211]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP213:%.*]] = getelementptr double, ptr [[A]], i64 54
-; CHECK-NEXT: [[COL_LOAD291:%.*]] = load <2 x double>, ptr [[TMP213]], align 8
-; CHECK-NEXT: [[VEC_GEP292:%.*]] = getelementptr double, ptr [[TMP213]], i64 8
-; CHECK-NEXT: [[COL_LOAD293:%.*]] = load <2 x double>, ptr [[VEC_GEP292]], align 8
-; CHECK-NEXT: [[TMP214:%.*]] = getelementptr double, ptr [[B]], i64 6
-; CHECK-NEXT: [[COL_LOAD294:%.*]] = load <2 x double>, ptr [[TMP214]], align 8
-; CHECK-NEXT: [[VEC_GEP295:%.*]] = getelementptr double, ptr [[TMP214]], i64 8
-; CHECK-NEXT: [[COL_LOAD296:%.*]] = load <2 x double>, ptr [[VEC_GEP295]], align 8
-; CHECK-NEXT: [[BLOCK297:%.*]] = shufflevector <2 x double> [[TMP206]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK298:%.*]] = shufflevector <2 x double> [[COL_LOAD291]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP215:%.*]] = extractelement <2 x double> [[COL_LOAD294]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT299:%.*]] = insertelement <2 x double> poison, double [[TMP215]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT300:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT299]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP216:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK298]], <2 x double> [[SPLAT_SPLAT300]], <2 x double> [[BLOCK297]])
-; CHECK-NEXT: [[BLOCK301:%.*]] = shufflevector <2 x double> [[COL_LOAD293]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP217:%.*]] = extractelement <2 x double> [[COL_LOAD294]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT302:%.*]] = insertelement <2 x double> poison, double [[TMP217]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT303:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT302]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP218:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK301]], <2 x double> [[SPLAT_SPLAT303]], <2 x double> [[TMP216]])
-; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x double> [[TMP218]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x double> [[TMP206]], <2 x double> [[TMP219]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK304:%.*]] = shufflevector <2 x double> [[TMP212]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK305:%.*]] = shufflevector <2 x double> [[COL_LOAD291]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP221:%.*]] = extractelement <2 x double> [[COL_LOAD296]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT306:%.*]] = insertelement <2 x double> poison, double [[TMP221]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT307:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT306]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP222:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK305]], <2 x double> [[SPLAT_SPLAT307]], <2 x double> [[BLOCK304]])
-; CHECK-NEXT: [[BLOCK308:%.*]] = shufflevector <2 x double> [[COL_LOAD293]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP223:%.*]] = extractelement <2 x double> [[COL_LOAD296]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT309:%.*]] = insertelement <2 x double> poison, double [[TMP223]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT310:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT309]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP224:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK308]], <2 x double> [[SPLAT_SPLAT310]], <2 x double> [[TMP222]])
-; CHECK-NEXT: [[TMP225:%.*]] = shufflevector <2 x double> [[TMP224]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP226:%.*]] = shufflevector <2 x double> [[TMP212]], <2 x double> [[TMP225]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP227:%.*]] = getelementptr double, ptr [[C]], i64 6
-; CHECK-NEXT: store <2 x double> [[TMP220]], ptr [[TMP227]], align 8
-; CHECK-NEXT: [[VEC_GEP311:%.*]] = getelementptr double, ptr [[TMP227]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP226]], ptr [[VEC_GEP311]], align 8
-; CHECK-NEXT: [[TMP228:%.*]] = getelementptr double, ptr [[A]], i64 0
-; CHECK-NEXT: [[COL_LOAD312:%.*]] = load <2 x double>, ptr [[TMP228]], align 8
-; CHECK-NEXT: [[VEC_GEP313:%.*]] = getelementptr double, ptr [[TMP228]], i64 8
-; CHECK-NEXT: [[COL_LOAD314:%.*]] = load <2 x double>, ptr [[VEC_GEP313]], align 8
-; CHECK-NEXT: [[TMP229:%.*]] = getelementptr double, ptr [[B]], i64 16
-; CHECK-NEXT: [[COL_LOAD315:%.*]] = load <2 x double>, ptr [[TMP229]], align 8
-; CHECK-NEXT: [[VEC_GEP316:%.*]] = getelementptr double, ptr [[TMP229]], i64 8
-; CHECK-NEXT: [[COL_LOAD317:%.*]] = load <2 x double>, ptr [[VEC_GEP316]], align 8
-; CHECK-NEXT: [[BLOCK318:%.*]] = shufflevector <2 x double> [[COL_LOAD312]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP230:%.*]] = extractelement <2 x double> [[COL_LOAD315]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT319:%.*]] = insertelement <2 x double> poison, double [[TMP230]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT320:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT319]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP231:%.*]] = fmul contract <2 x double> [[BLOCK318]], [[SPLAT_SPLAT320]]
-; CHECK-NEXT: [[BLOCK321:%.*]] = shufflevector <2 x double> [[COL_LOAD314]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP232:%.*]] = extractelement <2 x double> [[COL_LOAD315]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT322:%.*]] = insertelement <2 x double> poison, double [[TMP232]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT323:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT322]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP233:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK321]], <2 x double> [[SPLAT_SPLAT323]], <2 x double> [[TMP231]])
-; CHECK-NEXT: [[TMP234:%.*]] = shufflevector <2 x double> [[TMP233]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP235:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP234]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK324:%.*]] = shufflevector <2 x double> [[COL_LOAD312]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x double> [[COL_LOAD317]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT325:%.*]] = insertelement <2 x double> poison, double [[TMP236]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT326:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT325]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP237:%.*]] = fmul contract <2 x double> [[BLOCK324]], [[SPLAT_SPLAT326]]
-; CHECK-NEXT: [[BLOCK327:%.*]] = shufflevector <2 x double> [[COL_LOAD314]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x double> [[COL_LOAD317]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT328:%.*]] = insertelement <2 x double> poison, double [[TMP238]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT329:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT328]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP239:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK327]], <2 x double> [[SPLAT_SPLAT329]], <2 x double> [[TMP237]])
-; CHECK-NEXT: [[TMP240:%.*]] = shufflevector <2 x double> [[TMP239]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP241:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP240]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP242:%.*]] = getelementptr double, ptr [[A]], i64 16
-; CHECK-NEXT: [[COL_LOAD330:%.*]] = load <2 x double>, ptr [[TMP242]], align 8
-; CHECK-NEXT: [[VEC_GEP331:%.*]] = getelementptr double, ptr [[TMP242]], i64 8
-; CHECK-NEXT: [[COL_LOAD332:%.*]] = load <2 x double>, ptr [[VEC_GEP331]], align 8
-; CHECK-NEXT: [[TMP243:%.*]] = getelementptr double, ptr [[B]], i64 18
-; CHECK-NEXT: [[COL_LOAD333:%.*]] = load <2 x double>, ptr [[TMP243]], align 8
-; CHECK-NEXT: [[VEC_GEP334:%.*]] = getelementptr double, ptr [[TMP243]], i64 8
-; CHECK-NEXT: [[COL_LOAD335:%.*]] = load <2 x double>, ptr [[VEC_GEP334]], align 8
-; CHECK-NEXT: [[BLOCK336:%.*]] = shufflevector <2 x double> [[TMP235]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK337:%.*]] = shufflevector <2 x double> [[COL_LOAD330]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP244:%.*]] = extractelement <2 x double> [[COL_LOAD333]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT338:%.*]] = insertelement <2 x double> poison, double [[TMP244]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT339:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT338]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP245:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK337]], <2 x double> [[SPLAT_SPLAT339]], <2 x double> [[BLOCK336]])
-; CHECK-NEXT: [[BLOCK340:%.*]] = shufflevector <2 x double> [[COL_LOAD332]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP246:%.*]] = extractelement <2 x double> [[COL_LOAD333]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT341:%.*]] = insertelement <2 x double> poison, double [[TMP246]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT342:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT341]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP247:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK340]], <2 x double> [[SPLAT_SPLAT342]], <2 x double> [[TMP245]])
-; CHECK-NEXT: [[TMP248:%.*]] = shufflevector <2 x double> [[TMP247]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP249:%.*]] = shufflevector <2 x double> [[TMP235]], <2 x double> [[TMP248]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK343:%.*]] = shufflevector <2 x double> [[TMP241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK344:%.*]] = shufflevector <2 x double> [[COL_LOAD330]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP250:%.*]] = extractelement <2 x double> [[COL_LOAD335]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT345:%.*]] = insertelement <2 x double> poison, double [[TMP250]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT346:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT345]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP251:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK344]], <2 x double> [[SPLAT_SPLAT346]], <2 x double> [[BLOCK343]])
-; CHECK-NEXT: [[BLOCK347:%.*]] = shufflevector <2 x double> [[COL_LOAD332]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP252:%.*]] = extractelement <2 x double> [[COL_LOAD335]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT348:%.*]] = insertelement <2 x double> poison, double [[TMP252]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT349:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT348]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP253:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK347]], <2 x double> [[SPLAT_SPLAT349]], <2 x double> [[TMP251]])
-; CHECK-NEXT: [[TMP254:%.*]] = shufflevector <2 x double> [[TMP253]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP255:%.*]] = shufflevector <2 x double> [[TMP241]], <2 x double> [[TMP254]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP256:%.*]] = getelementptr double, ptr [[A]], i64 32
-; CHECK-NEXT: [[COL_LOAD350:%.*]] = load <2 x double>, ptr [[TMP256]], align 8
-; CHECK-NEXT: [[VEC_GEP351:%.*]] = getelementptr double, ptr [[TMP256]], i64 8
-; CHECK-NEXT: [[COL_LOAD352:%.*]] = load <2 x double>, ptr [[VEC_GEP351]], align 8
-; CHECK-NEXT: [[TMP257:%.*]] = getelementptr double, ptr [[B]], i64 20
-; CHECK-NEXT: [[COL_LOAD353:%.*]] = load <2 x double>, ptr [[TMP257]], align 8
-; CHECK-NEXT: [[VEC_GEP354:%.*]] = getelementptr double, ptr [[TMP257]], i64 8
-; CHECK-NEXT: [[COL_LOAD355:%.*]] = load <2 x double>, ptr [[VEC_GEP354]], align 8
-; CHECK-NEXT: [[BLOCK356:%.*]] = shufflevector <2 x double> [[TMP249]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK357:%.*]] = shufflevector <2 x double> [[COL_LOAD350]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP258:%.*]] = extractelement <2 x double> [[COL_LOAD353]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT358:%.*]] = insertelement <2 x double> poison, double [[TMP258]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT359:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT358]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP259:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK357]], <2 x double> [[SPLAT_SPLAT359]], <2 x double> [[BLOCK356]])
-; CHECK-NEXT: [[BLOCK360:%.*]] = shufflevector <2 x double> [[COL_LOAD352]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP260:%.*]] = extractelement <2 x double> [[COL_LOAD353]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT361:%.*]] = insertelement <2 x double> poison, double [[TMP260]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT362:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT361]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP261:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK360]], <2 x double> [[SPLAT_SPLAT362]], <2 x double> [[TMP259]])
-; CHECK-NEXT: [[TMP262:%.*]] = shufflevector <2 x double> [[TMP261]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP263:%.*]] = shufflevector <2 x double> [[TMP249]], <2 x double> [[TMP262]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK363:%.*]] = shufflevector <2 x double> [[TMP255]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK364:%.*]] = shufflevector <2 x double> [[COL_LOAD350]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP264:%.*]] = extractelement <2 x double> [[COL_LOAD355]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT365:%.*]] = insertelement <2 x double> poison, double [[TMP264]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT366:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT365]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP265:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK364]], <2 x double> [[SPLAT_SPLAT366]], <2 x double> [[BLOCK363]])
-; CHECK-NEXT: [[BLOCK367:%.*]] = shufflevector <2 x double> [[COL_LOAD352]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP266:%.*]] = extractelement <2 x double> [[COL_LOAD355]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT368:%.*]] = insertelement <2 x double> poison, double [[TMP266]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT369:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT368]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP267:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK367]], <2 x double> [[SPLAT_SPLAT369]], <2 x double> [[TMP265]])
-; CHECK-NEXT: [[TMP268:%.*]] = shufflevector <2 x double> [[TMP267]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP269:%.*]] = shufflevector <2 x double> [[TMP255]], <2 x double> [[TMP268]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP270:%.*]] = getelementptr double, ptr [[A]], i64 48
-; CHECK-NEXT: [[COL_LOAD370:%.*]] = load <2 x double>, ptr [[TMP270]], align 8
-; CHECK-NEXT: [[VEC_GEP371:%.*]] = getelementptr double, ptr [[TMP270]], i64 8
-; CHECK-NEXT: [[COL_LOAD372:%.*]] = load <2 x double>, ptr [[VEC_GEP371]], align 8
-; CHECK-NEXT: [[TMP271:%.*]] = getelementptr double, ptr [[B]], i64 22
-; CHECK-NEXT: [[COL_LOAD373:%.*]] = load <2 x double>, ptr [[TMP271]], align 8
-; CHECK-NEXT: [[VEC_GEP374:%.*]] = getelementptr double, ptr [[TMP271]], i64 8
-; CHECK-NEXT: [[COL_LOAD375:%.*]] = load <2 x double>, ptr [[VEC_GEP374]], align 8
-; CHECK-NEXT: [[BLOCK376:%.*]] = shufflevector <2 x double> [[TMP263]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK377:%.*]] = shufflevector <2 x double> [[COL_LOAD370]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP272:%.*]] = extractelement <2 x double> [[COL_LOAD373]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT378:%.*]] = insertelement <2 x double> poison, double [[TMP272]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT379:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT378]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP273:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK377]], <2 x double> [[SPLAT_SPLAT379]], <2 x double> [[BLOCK376]])
-; CHECK-NEXT: [[BLOCK380:%.*]] = shufflevector <2 x double> [[COL_LOAD372]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x double> [[COL_LOAD373]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT381:%.*]] = insertelement <2 x double> poison, double [[TMP274]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT382:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT381]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP275:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK380]], <2 x double> [[SPLAT_SPLAT382]], <2 x double> [[TMP273]])
-; CHECK-NEXT: [[TMP276:%.*]] = shufflevector <2 x double> [[TMP275]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP277:%.*]] = shufflevector <2 x double> [[TMP263]], <2 x double> [[TMP276]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK383:%.*]] = shufflevector <2 x double> [[TMP269]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK384:%.*]] = shufflevector <2 x double> [[COL_LOAD370]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP278:%.*]] = extractelement <2 x double> [[COL_LOAD375]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT385:%.*]] = insertelement <2 x double> poison, double [[TMP278]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT386:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT385]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP279:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK384]], <2 x double> [[SPLAT_SPLAT386]], <2 x double> [[BLOCK383]])
-; CHECK-NEXT: [[BLOCK387:%.*]] = shufflevector <2 x double> [[COL_LOAD372]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP280:%.*]] = extractelement <2 x double> [[COL_LOAD375]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT388:%.*]] = insertelement <2 x double> poison, double [[TMP280]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT389:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT388]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP281:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK387]], <2 x double> [[SPLAT_SPLAT389]], <2 x double> [[TMP279]])
-; CHECK-NEXT: [[TMP282:%.*]] = shufflevector <2 x double> [[TMP281]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP283:%.*]] = shufflevector <2 x double> [[TMP269]], <2 x double> [[TMP282]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP284:%.*]] = getelementptr double, ptr [[C]], i64 16
-; CHECK-NEXT: store <2 x double> [[TMP277]], ptr [[TMP284]], align 8
-; CHECK-NEXT: [[VEC_GEP390:%.*]] = getelementptr double, ptr [[TMP284]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP283]], ptr [[VEC_GEP390]], align 8
-; CHECK-NEXT: [[TMP285:%.*]] = getelementptr double, ptr [[A]], i64 2
-; CHECK-NEXT: [[COL_LOAD391:%.*]] = load <2 x double>, ptr [[TMP285]], align 8
-; CHECK-NEXT: [[VEC_GEP392:%.*]] = getelementptr double, ptr [[TMP285]], i64 8
-; CHECK-NEXT: [[COL_LOAD393:%.*]] = load <2 x double>, ptr [[VEC_GEP392]], align 8
-; CHECK-NEXT: [[TMP286:%.*]] = getelementptr double, ptr [[B]], i64 16
-; CHECK-NEXT: [[COL_LOAD394:%.*]] = load <2 x double>, ptr [[TMP286]], align 8
-; CHECK-NEXT: [[VEC_GEP395:%.*]] = getelementptr double, ptr [[TMP286]], i64 8
-; CHECK-NEXT: [[COL_LOAD396:%.*]] = load <2 x double>, ptr [[VEC_GEP395]], align 8
-; CHECK-NEXT: [[BLOCK397:%.*]] = shufflevector <2 x double> [[COL_LOAD391]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP287:%.*]] = extractelement <2 x double> [[COL_LOAD394]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT398:%.*]] = insertelement <2 x double> poison, double [[TMP287]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT399:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT398]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP288:%.*]] = fmul contract <2 x double> [[BLOCK397]], [[SPLAT_SPLAT399]]
-; CHECK-NEXT: [[BLOCK400:%.*]] = shufflevector <2 x double> [[COL_LOAD393]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP289:%.*]] = extractelement <2 x double> [[COL_LOAD394]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT401:%.*]] = insertelement <2 x double> poison, double [[TMP289]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT402:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT401]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP290:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK400]], <2 x double> [[SPLAT_SPLAT402]], <2 x double> [[TMP288]])
-; CHECK-NEXT: [[TMP291:%.*]] = shufflevector <2 x double> [[TMP290]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP292:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP291]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK403:%.*]] = shufflevector <2 x double> [[COL_LOAD391]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x double> [[COL_LOAD396]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT404:%.*]] = insertelement <2 x double> poison, double [[TMP293]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT405:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT404]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP294:%.*]] = fmul contract <2 x double> [[BLOCK403]], [[SPLAT_SPLAT405]]
-; CHECK-NEXT: [[BLOCK406:%.*]] = shufflevector <2 x double> [[COL_LOAD393]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP295:%.*]] = extractelement <2 x double> [[COL_LOAD396]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT407:%.*]] = insertelement <2 x double> poison, double [[TMP295]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT408:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT407]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP296:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK406]], <2 x double> [[SPLAT_SPLAT408]], <2 x double> [[TMP294]])
-; CHECK-NEXT: [[TMP297:%.*]] = shufflevector <2 x double> [[TMP296]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP298:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP297]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP299:%.*]] = getelementptr double, ptr [[A]], i64 18
-; CHECK-NEXT: [[COL_LOAD409:%.*]] = load <2 x double>, ptr [[TMP299]], align 8
-; CHECK-NEXT: [[VEC_GEP410:%.*]] = getelementptr double, ptr [[TMP299]], i64 8
-; CHECK-NEXT: [[COL_LOAD411:%.*]] = load <2 x double>, ptr [[VEC_GEP410]], align 8
-; CHECK-NEXT: [[TMP300:%.*]] = getelementptr double, ptr [[B]], i64 18
-; CHECK-NEXT: [[COL_LOAD412:%.*]] = load <2 x double>, ptr [[TMP300]], align 8
-; CHECK-NEXT: [[VEC_GEP413:%.*]] = getelementptr double, ptr [[TMP300]], i64 8
-; CHECK-NEXT: [[COL_LOAD414:%.*]] = load <2 x double>, ptr [[VEC_GEP413]], align 8
-; CHECK-NEXT: [[BLOCK415:%.*]] = shufflevector <2 x double> [[TMP292]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK416:%.*]] = shufflevector <2 x double> [[COL_LOAD409]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP301:%.*]] = extractelement <2 x double> [[COL_LOAD412]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT417:%.*]] = insertelement <2 x double> poison, double [[TMP301]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT418:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT417]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP302:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK416]], <2 x double> [[SPLAT_SPLAT418]], <2 x double> [[BLOCK415]])
-; CHECK-NEXT: [[BLOCK419:%.*]] = shufflevector <2 x double> [[COL_LOAD411]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP303:%.*]] = extractelement <2 x double> [[COL_LOAD412]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT420:%.*]] = insertelement <2 x double> poison, double [[TMP303]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT421:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT420]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP304:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK419]], <2 x double> [[SPLAT_SPLAT421]], <2 x double> [[TMP302]])
-; CHECK-NEXT: [[TMP305:%.*]] = shufflevector <2 x double> [[TMP304]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP306:%.*]] = shufflevector <2 x double> [[TMP292]], <2 x double> [[TMP305]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK422:%.*]] = shufflevector <2 x double> [[TMP298]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK423:%.*]] = shufflevector <2 x double> [[COL_LOAD409]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP307:%.*]] = extractelement <2 x double> [[COL_LOAD414]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT424:%.*]] = insertelement <2 x double> poison, double [[TMP307]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT425:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT424]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP308:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK423]], <2 x double> [[SPLAT_SPLAT425]], <2 x double> [[BLOCK422]])
-; CHECK-NEXT: [[BLOCK426:%.*]] = shufflevector <2 x double> [[COL_LOAD411]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP309:%.*]] = extractelement <2 x double> [[COL_LOAD414]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT427:%.*]] = insertelement <2 x double> poison, double [[TMP309]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT428:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT427]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP310:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK426]], <2 x double> [[SPLAT_SPLAT428]], <2 x double> [[TMP308]])
-; CHECK-NEXT: [[TMP311:%.*]] = shufflevector <2 x double> [[TMP310]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP312:%.*]] = shufflevector <2 x double> [[TMP298]], <2 x double> [[TMP311]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP313:%.*]] = getelementptr double, ptr [[A]], i64 34
-; CHECK-NEXT: [[COL_LOAD429:%.*]] = load <2 x double>, ptr [[TMP313]], align 8
-; CHECK-NEXT: [[VEC_GEP430:%.*]] = getelementptr double, ptr [[TMP313]], i64 8
-; CHECK-NEXT: [[COL_LOAD431:%.*]] = load <2 x double>, ptr [[VEC_GEP430]], align 8
-; CHECK-NEXT: [[TMP314:%.*]] = getelementptr double, ptr [[B]], i64 20
-; CHECK-NEXT: [[COL_LOAD432:%.*]] = load <2 x double>, ptr [[TMP314]], align 8
-; CHECK-NEXT: [[VEC_GEP433:%.*]] = getelementptr double, ptr [[TMP314]], i64 8
-; CHECK-NEXT: [[COL_LOAD434:%.*]] = load <2 x double>, ptr [[VEC_GEP433]], align 8
-; CHECK-NEXT: [[BLOCK435:%.*]] = shufflevector <2 x double> [[TMP306]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK436:%.*]] = shufflevector <2 x double> [[COL_LOAD429]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP315:%.*]] = extractelement <2 x double> [[COL_LOAD432]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT437:%.*]] = insertelement <2 x double> poison, double [[TMP315]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT438:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT437]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP316:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK436]], <2 x double> [[SPLAT_SPLAT438]], <2 x double> [[BLOCK435]])
-; CHECK-NEXT: [[BLOCK439:%.*]] = shufflevector <2 x double> [[COL_LOAD431]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP317:%.*]] = extractelement <2 x double> [[COL_LOAD432]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT440:%.*]] = insertelement <2 x double> poison, double [[TMP317]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT441:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT440]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP318:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK439]], <2 x double> [[SPLAT_SPLAT441]], <2 x double> [[TMP316]])
-; CHECK-NEXT: [[TMP319:%.*]] = shufflevector <2 x double> [[TMP318]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP320:%.*]] = shufflevector <2 x double> [[TMP306]], <2 x double> [[TMP319]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK442:%.*]] = shufflevector <2 x double> [[TMP312]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK443:%.*]] = shufflevector <2 x double> [[COL_LOAD429]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP321:%.*]] = extractelement <2 x double> [[COL_LOAD434]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT444:%.*]] = insertelement <2 x double> poison, double [[TMP321]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT445:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT444]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP322:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK443]], <2 x double> [[SPLAT_SPLAT445]], <2 x double> [[BLOCK442]])
-; CHECK-NEXT: [[BLOCK446:%.*]] = shufflevector <2 x double> [[COL_LOAD431]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP323:%.*]] = extractelement <2 x double> [[COL_LOAD434]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT447:%.*]] = insertelement <2 x double> poison, double [[TMP323]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT448:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT447]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP324:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK446]], <2 x double> [[SPLAT_SPLAT448]], <2 x double> [[TMP322]])
-; CHECK-NEXT: [[TMP325:%.*]] = shufflevector <2 x double> [[TMP324]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP326:%.*]] = shufflevector <2 x double> [[TMP312]], <2 x double> [[TMP325]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP327:%.*]] = getelementptr double, ptr [[A]], i64 50
-; CHECK-NEXT: [[COL_LOAD449:%.*]] = load <2 x double>, ptr [[TMP327]], align 8
-; CHECK-NEXT: [[VEC_GEP450:%.*]] = getelementptr double, ptr [[TMP327]], i64 8
-; CHECK-NEXT: [[COL_LOAD451:%.*]] = load <2 x double>, ptr [[VEC_GEP450]], align 8
-; CHECK-NEXT: [[TMP328:%.*]] = getelementptr double, ptr [[B]], i64 22
-; CHECK-NEXT: [[COL_LOAD452:%.*]] = load <2 x double>, ptr [[TMP328]], align 8
-; CHECK-NEXT: [[VEC_GEP453:%.*]] = getelementptr double, ptr [[TMP328]], i64 8
-; CHECK-NEXT: [[COL_LOAD454:%.*]] = load <2 x double>, ptr [[VEC_GEP453]], align 8
-; CHECK-NEXT: [[BLOCK455:%.*]] = shufflevector <2 x double> [[TMP320]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK456:%.*]] = shufflevector <2 x double> [[COL_LOAD449]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP329:%.*]] = extractelement <2 x double> [[COL_LOAD452]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT457:%.*]] = insertelement <2 x double> poison, double [[TMP329]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT458:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT457]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP330:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK456]], <2 x double> [[SPLAT_SPLAT458]], <2 x double> [[BLOCK455]])
-; CHECK-NEXT: [[BLOCK459:%.*]] = shufflevector <2 x double> [[COL_LOAD451]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP331:%.*]] = extractelement <2 x double> [[COL_LOAD452]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT460:%.*]] = insertelement <2 x double> poison, double [[TMP331]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT461:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT460]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP332:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK459]], <2 x double> [[SPLAT_SPLAT461]], <2 x double> [[TMP330]])
-; CHECK-NEXT: [[TMP333:%.*]] = shufflevector <2 x double> [[TMP332]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP334:%.*]] = shufflevector <2 x double> [[TMP320]], <2 x double> [[TMP333]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK462:%.*]] = shufflevector <2 x double> [[TMP326]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK463:%.*]] = shufflevector <2 x double> [[COL_LOAD449]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP335:%.*]] = extractelement <2 x double> [[COL_LOAD454]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT464:%.*]] = insertelement <2 x double> poison, double [[TMP335]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT465:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT464]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP336:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK463]], <2 x double> [[SPLAT_SPLAT465]], <2 x double> [[BLOCK462]])
-; CHECK-NEXT: [[BLOCK466:%.*]] = shufflevector <2 x double> [[COL_LOAD451]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP337:%.*]] = extractelement <2 x double> [[COL_LOAD454]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT467:%.*]] = insertelement <2 x double> poison, double [[TMP337]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT468:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT467]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP338:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK466]], <2 x double> [[SPLAT_SPLAT468]], <2 x double> [[TMP336]])
-; CHECK-NEXT: [[TMP339:%.*]] = shufflevector <2 x double> [[TMP338]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP340:%.*]] = shufflevector <2 x double> [[TMP326]], <2 x double> [[TMP339]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP341:%.*]] = getelementptr double, ptr [[C]], i64 18
-; CHECK-NEXT: store <2 x double> [[TMP334]], ptr [[TMP341]], align 8
-; CHECK-NEXT: [[VEC_GEP469:%.*]] = getelementptr double, ptr [[TMP341]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP340]], ptr [[VEC_GEP469]], align 8
-; CHECK-NEXT: [[TMP342:%.*]] = getelementptr double, ptr [[A]], i64 4
-; CHECK-NEXT: [[COL_LOAD470:%.*]] = load <2 x double>, ptr [[TMP342]], align 8
-; CHECK-NEXT: [[VEC_GEP471:%.*]] = getelementptr double, ptr [[TMP342]], i64 8
-; CHECK-NEXT: [[COL_LOAD472:%.*]] = load <2 x double>, ptr [[VEC_GEP471]], align 8
-; CHECK-NEXT: [[TMP343:%.*]] = getelementptr double, ptr [[B]], i64 16
-; CHECK-NEXT: [[COL_LOAD473:%.*]] = load <2 x double>, ptr [[TMP343]], align 8
-; CHECK-NEXT: [[VEC_GEP474:%.*]] = getelementptr double, ptr [[TMP343]], i64 8
-; CHECK-NEXT: [[COL_LOAD475:%.*]] = load <2 x double>, ptr [[VEC_GEP474]], align 8
-; CHECK-NEXT: [[BLOCK476:%.*]] = shufflevector <2 x double> [[COL_LOAD470]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP344:%.*]] = extractelement <2 x double> [[COL_LOAD473]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT477:%.*]] = insertelement <2 x double> poison, double [[TMP344]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT478:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT477]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP345:%.*]] = fmul contract <2 x double> [[BLOCK476]], [[SPLAT_SPLAT478]]
-; CHECK-NEXT: [[BLOCK479:%.*]] = shufflevector <2 x double> [[COL_LOAD472]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP346:%.*]] = extractelement <2 x double> [[COL_LOAD473]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT480:%.*]] = insertelement <2 x double> poison, double [[TMP346]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT481:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT480]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP347:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK479]], <2 x double> [[SPLAT_SPLAT481]], <2 x double> [[TMP345]])
-; CHECK-NEXT: [[TMP348:%.*]] = shufflevector <2 x double> [[TMP347]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP349:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP348]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK482:%.*]] = shufflevector <2 x double> [[COL_LOAD470]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP350:%.*]] = extractelement <2 x double> [[COL_LOAD475]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT483:%.*]] = insertelement <2 x double> poison, double [[TMP350]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT484:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT483]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP351:%.*]] = fmul contract <2 x double> [[BLOCK482]], [[SPLAT_SPLAT484]]
-; CHECK-NEXT: [[BLOCK485:%.*]] = shufflevector <2 x double> [[COL_LOAD472]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP352:%.*]] = extractelement <2 x double> [[COL_LOAD475]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT486:%.*]] = insertelement <2 x double> poison, double [[TMP352]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT487:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT486]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP353:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK485]], <2 x double> [[SPLAT_SPLAT487]], <2 x double> [[TMP351]])
-; CHECK-NEXT: [[TMP354:%.*]] = shufflevector <2 x double> [[TMP353]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP355:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP354]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP356:%.*]] = getelementptr double, ptr [[A]], i64 20
-; CHECK-NEXT: [[COL_LOAD488:%.*]] = load <2 x double>, ptr [[TMP356]], align 8
-; CHECK-NEXT: [[VEC_GEP489:%.*]] = getelementptr double, ptr [[TMP356]], i64 8
-; CHECK-NEXT: [[COL_LOAD490:%.*]] = load <2 x double>, ptr [[VEC_GEP489]], align 8
-; CHECK-NEXT: [[TMP357:%.*]] = getelementptr double, ptr [[B]], i64 18
-; CHECK-NEXT: [[COL_LOAD491:%.*]] = load <2 x double>, ptr [[TMP357]], align 8
-; CHECK-NEXT: [[VEC_GEP492:%.*]] = getelementptr double, ptr [[TMP357]], i64 8
-; CHECK-NEXT: [[COL_LOAD493:%.*]] = load <2 x double>, ptr [[VEC_GEP492]], align 8
-; CHECK-NEXT: [[BLOCK494:%.*]] = shufflevector <2 x double> [[TMP349]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK495:%.*]] = shufflevector <2 x double> [[COL_LOAD488]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP358:%.*]] = extractelement <2 x double> [[COL_LOAD491]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT496:%.*]] = insertelement <2 x double> poison, double [[TMP358]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT497:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT496]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP359:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK495]], <2 x double> [[SPLAT_SPLAT497]], <2 x double> [[BLOCK494]])
-; CHECK-NEXT: [[BLOCK498:%.*]] = shufflevector <2 x double> [[COL_LOAD490]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP360:%.*]] = extractelement <2 x double> [[COL_LOAD491]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT499:%.*]] = insertelement <2 x double> poison, double [[TMP360]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT500:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT499]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP361:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK498]], <2 x double> [[SPLAT_SPLAT500]], <2 x double> [[TMP359]])
-; CHECK-NEXT: [[TMP362:%.*]] = shufflevector <2 x double> [[TMP361]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP363:%.*]] = shufflevector <2 x double> [[TMP349]], <2 x double> [[TMP362]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK501:%.*]] = shufflevector <2 x double> [[TMP355]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK502:%.*]] = shufflevector <2 x double> [[COL_LOAD488]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP364:%.*]] = extractelement <2 x double> [[COL_LOAD493]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT503:%.*]] = insertelement <2 x double> poison, double [[TMP364]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT504:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT503]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP365:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK502]], <2 x double> [[SPLAT_SPLAT504]], <2 x double> [[BLOCK501]])
-; CHECK-NEXT: [[BLOCK505:%.*]] = shufflevector <2 x double> [[COL_LOAD490]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP366:%.*]] = extractelement <2 x double> [[COL_LOAD493]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT506:%.*]] = insertelement <2 x double> poison, double [[TMP366]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT507:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT506]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP367:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK505]], <2 x double> [[SPLAT_SPLAT507]], <2 x double> [[TMP365]])
-; CHECK-NEXT: [[TMP368:%.*]] = shufflevector <2 x double> [[TMP367]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP369:%.*]] = shufflevector <2 x double> [[TMP355]], <2 x double> [[TMP368]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP370:%.*]] = getelementptr double, ptr [[A]], i64 36
-; CHECK-NEXT: [[COL_LOAD508:%.*]] = load <2 x double>, ptr [[TMP370]], align 8
-; CHECK-NEXT: [[VEC_GEP509:%.*]] = getelementptr double, ptr [[TMP370]], i64 8
-; CHECK-NEXT: [[COL_LOAD510:%.*]] = load <2 x double>, ptr [[VEC_GEP509]], align 8
-; CHECK-NEXT: [[TMP371:%.*]] = getelementptr double, ptr [[B]], i64 20
-; CHECK-NEXT: [[COL_LOAD511:%.*]] = load <2 x double>, ptr [[TMP371]], align 8
-; CHECK-NEXT: [[VEC_GEP512:%.*]] = getelementptr double, ptr [[TMP371]], i64 8
-; CHECK-NEXT: [[COL_LOAD513:%.*]] = load <2 x double>, ptr [[VEC_GEP512]], align 8
-; CHECK-NEXT: [[BLOCK514:%.*]] = shufflevector <2 x double> [[TMP363]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK515:%.*]] = shufflevector <2 x double> [[COL_LOAD508]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP372:%.*]] = extractelement <2 x double> [[COL_LOAD511]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT516:%.*]] = insertelement <2 x double> poison, double [[TMP372]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT517:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT516]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP373:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK515]], <2 x double> [[SPLAT_SPLAT517]], <2 x double> [[BLOCK514]])
-; CHECK-NEXT: [[BLOCK518:%.*]] = shufflevector <2 x double> [[COL_LOAD510]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP374:%.*]] = extractelement <2 x double> [[COL_LOAD511]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT519:%.*]] = insertelement <2 x double> poison, double [[TMP374]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT520:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT519]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP375:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK518]], <2 x double> [[SPLAT_SPLAT520]], <2 x double> [[TMP373]])
-; CHECK-NEXT: [[TMP376:%.*]] = shufflevector <2 x double> [[TMP375]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP377:%.*]] = shufflevector <2 x double> [[TMP363]], <2 x double> [[TMP376]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK521:%.*]] = shufflevector <2 x double> [[TMP369]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK522:%.*]] = shufflevector <2 x double> [[COL_LOAD508]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP378:%.*]] = extractelement <2 x double> [[COL_LOAD513]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT523:%.*]] = insertelement <2 x double> poison, double [[TMP378]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT524:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT523]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP379:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK522]], <2 x double> [[SPLAT_SPLAT524]], <2 x double> [[BLOCK521]])
-; CHECK-NEXT: [[BLOCK525:%.*]] = shufflevector <2 x double> [[COL_LOAD510]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP380:%.*]] = extractelement <2 x double> [[COL_LOAD513]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT526:%.*]] = insertelement <2 x double> poison, double [[TMP380]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT527:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT526]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP381:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK525]], <2 x double> [[SPLAT_SPLAT527]], <2 x double> [[TMP379]])
-; CHECK-NEXT: [[TMP382:%.*]] = shufflevector <2 x double> [[TMP381]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP383:%.*]] = shufflevector <2 x double> [[TMP369]], <2 x double> [[TMP382]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP384:%.*]] = getelementptr double, ptr [[A]], i64 52
-; CHECK-NEXT: [[COL_LOAD528:%.*]] = load <2 x double>, ptr [[TMP384]], align 8
-; CHECK-NEXT: [[VEC_GEP529:%.*]] = getelementptr double, ptr [[TMP384]], i64 8
-; CHECK-NEXT: [[COL_LOAD530:%.*]] = load <2 x double>, ptr [[VEC_GEP529]], align 8
-; CHECK-NEXT: [[TMP385:%.*]] = getelementptr double, ptr [[B]], i64 22
-; CHECK-NEXT: [[COL_LOAD531:%.*]] = load <2 x double>, ptr [[TMP385]], align 8
-; CHECK-NEXT: [[VEC_GEP532:%.*]] = getelementptr double, ptr [[TMP385]], i64 8
-; CHECK-NEXT: [[COL_LOAD533:%.*]] = load <2 x double>, ptr [[VEC_GEP532]], align 8
-; CHECK-NEXT: [[BLOCK534:%.*]] = shufflevector <2 x double> [[TMP377]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK535:%.*]] = shufflevector <2 x double> [[COL_LOAD528]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP386:%.*]] = extractelement <2 x double> [[COL_LOAD531]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT536:%.*]] = insertelement <2 x double> poison, double [[TMP386]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT537:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT536]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP387:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK535]], <2 x double> [[SPLAT_SPLAT537]], <2 x double> [[BLOCK534]])
-; CHECK-NEXT: [[BLOCK538:%.*]] = shufflevector <2 x double> [[COL_LOAD530]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP388:%.*]] = extractelement <2 x double> [[COL_LOAD531]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT539:%.*]] = insertelement <2 x double> poison, double [[TMP388]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT540:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT539]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP389:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK538]], <2 x double> [[SPLAT_SPLAT540]], <2 x double> [[TMP387]])
-; CHECK-NEXT: [[TMP390:%.*]] = shufflevector <2 x double> [[TMP389]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP391:%.*]] = shufflevector <2 x double> [[TMP377]], <2 x double> [[TMP390]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK541:%.*]] = shufflevector <2 x double> [[TMP383]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK542:%.*]] = shufflevector <2 x double> [[COL_LOAD528]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP392:%.*]] = extractelement <2 x double> [[COL_LOAD533]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT543:%.*]] = insertelement <2 x double> poison, double [[TMP392]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT544:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT543]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP393:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK542]], <2 x double> [[SPLAT_SPLAT544]], <2 x double> [[BLOCK541]])
-; CHECK-NEXT: [[BLOCK545:%.*]] = shufflevector <2 x double> [[COL_LOAD530]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP394:%.*]] = extractelement <2 x double> [[COL_LOAD533]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT546:%.*]] = insertelement <2 x double> poison, double [[TMP394]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT547:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT546]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP395:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK545]], <2 x double> [[SPLAT_SPLAT547]], <2 x double> [[TMP393]])
-; CHECK-NEXT: [[TMP396:%.*]] = shufflevector <2 x double> [[TMP395]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP397:%.*]] = shufflevector <2 x double> [[TMP383]], <2 x double> [[TMP396]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP398:%.*]] = getelementptr double, ptr [[C]], i64 20
-; CHECK-NEXT: store <2 x double> [[TMP391]], ptr [[TMP398]], align 8
-; CHECK-NEXT: [[VEC_GEP548:%.*]] = getelementptr double, ptr [[TMP398]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP397]], ptr [[VEC_GEP548]], align 8
-; CHECK-NEXT: [[TMP399:%.*]] = getelementptr double, ptr [[A]], i64 6
-; CHECK-NEXT: [[COL_LOAD549:%.*]] = load <2 x double>, ptr [[TMP399]], align 8
-; CHECK-NEXT: [[VEC_GEP550:%.*]] = getelementptr double, ptr [[TMP399]], i64 8
-; CHECK-NEXT: [[COL_LOAD551:%.*]] = load <2 x double>, ptr [[VEC_GEP550]], align 8
-; CHECK-NEXT: [[TMP400:%.*]] = getelementptr double, ptr [[B]], i64 16
-; CHECK-NEXT: [[COL_LOAD552:%.*]] = load <2 x double>, ptr [[TMP400]], align 8
-; CHECK-NEXT: [[VEC_GEP553:%.*]] = getelementptr double, ptr [[TMP400]], i64 8
-; CHECK-NEXT: [[COL_LOAD554:%.*]] = load <2 x double>, ptr [[VEC_GEP553]], align 8
-; CHECK-NEXT: [[BLOCK555:%.*]] = shufflevector <2 x double> [[COL_LOAD549]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP401:%.*]] = extractelement <2 x double> [[COL_LOAD552]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT556:%.*]] = insertelement <2 x double> poison, double [[TMP401]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT557:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT556]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP402:%.*]] = fmul contract <2 x double> [[BLOCK555]], [[SPLAT_SPLAT557]]
-; CHECK-NEXT: [[BLOCK558:%.*]] = shufflevector <2 x double> [[COL_LOAD551]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP403:%.*]] = extractelement <2 x double> [[COL_LOAD552]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT559:%.*]] = insertelement <2 x double> poison, double [[TMP403]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT560:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT559]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP404:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK558]], <2 x double> [[SPLAT_SPLAT560]], <2 x double> [[TMP402]])
-; CHECK-NEXT: [[TMP405:%.*]] = shufflevector <2 x double> [[TMP404]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP406:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP405]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK561:%.*]] = shufflevector <2 x double> [[COL_LOAD549]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP407:%.*]] = extractelement <2 x double> [[COL_LOAD554]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT562:%.*]] = insertelement <2 x double> poison, double [[TMP407]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT563:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT562]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP408:%.*]] = fmul contract <2 x double> [[BLOCK561]], [[SPLAT_SPLAT563]]
-; CHECK-NEXT: [[BLOCK564:%.*]] = shufflevector <2 x double> [[COL_LOAD551]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP409:%.*]] = extractelement <2 x double> [[COL_LOAD554]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT565:%.*]] = insertelement <2 x double> poison, double [[TMP409]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT566:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT565]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP410:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK564]], <2 x double> [[SPLAT_SPLAT566]], <2 x double> [[TMP408]])
-; CHECK-NEXT: [[TMP411:%.*]] = shufflevector <2 x double> [[TMP410]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP412:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP411]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP413:%.*]] = getelementptr double, ptr [[A]], i64 22
-; CHECK-NEXT: [[COL_LOAD567:%.*]] = load <2 x double>, ptr [[TMP413]], align 8
-; CHECK-NEXT: [[VEC_GEP568:%.*]] = getelementptr double, ptr [[TMP413]], i64 8
-; CHECK-NEXT: [[COL_LOAD569:%.*]] = load <2 x double>, ptr [[VEC_GEP568]], align 8
-; CHECK-NEXT: [[TMP414:%.*]] = getelementptr double, ptr [[B]], i64 18
-; CHECK-NEXT: [[COL_LOAD570:%.*]] = load <2 x double>, ptr [[TMP414]], align 8
-; CHECK-NEXT: [[VEC_GEP571:%.*]] = getelementptr double, ptr [[TMP414]], i64 8
-; CHECK-NEXT: [[COL_LOAD572:%.*]] = load <2 x double>, ptr [[VEC_GEP571]], align 8
-; CHECK-NEXT: [[BLOCK573:%.*]] = shufflevector <2 x double> [[TMP406]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK574:%.*]] = shufflevector <2 x double> [[COL_LOAD567]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP415:%.*]] = extractelement <2 x double> [[COL_LOAD570]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT575:%.*]] = insertelement <2 x double> poison, double [[TMP415]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT576:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT575]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP416:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK574]], <2 x double> [[SPLAT_SPLAT576]], <2 x double> [[BLOCK573]])
-; CHECK-NEXT: [[BLOCK577:%.*]] = shufflevector <2 x double> [[COL_LOAD569]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP417:%.*]] = extractelement <2 x double> [[COL_LOAD570]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT578:%.*]] = insertelement <2 x double> poison, double [[TMP417]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT579:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT578]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP418:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK577]], <2 x double> [[SPLAT_SPLAT579]], <2 x double> [[TMP416]])
-; CHECK-NEXT: [[TMP419:%.*]] = shufflevector <2 x double> [[TMP418]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP420:%.*]] = shufflevector <2 x double> [[TMP406]], <2 x double> [[TMP419]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK580:%.*]] = shufflevector <2 x double> [[TMP412]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK581:%.*]] = shufflevector <2 x double> [[COL_LOAD567]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP421:%.*]] = extractelement <2 x double> [[COL_LOAD572]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT582:%.*]] = insertelement <2 x double> poison, double [[TMP421]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT583:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT582]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP422:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK581]], <2 x double> [[SPLAT_SPLAT583]], <2 x double> [[BLOCK580]])
-; CHECK-NEXT: [[BLOCK584:%.*]] = shufflevector <2 x double> [[COL_LOAD569]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP423:%.*]] = extractelement <2 x double> [[COL_LOAD572]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT585:%.*]] = insertelement <2 x double> poison, double [[TMP423]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT586:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT585]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP424:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK584]], <2 x double> [[SPLAT_SPLAT586]], <2 x double> [[TMP422]])
-; CHECK-NEXT: [[TMP425:%.*]] = shufflevector <2 x double> [[TMP424]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP426:%.*]] = shufflevector <2 x double> [[TMP412]], <2 x double> [[TMP425]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP427:%.*]] = getelementptr double, ptr [[A]], i64 38
-; CHECK-NEXT: [[COL_LOAD587:%.*]] = load <2 x double>, ptr [[TMP427]], align 8
-; CHECK-NEXT: [[VEC_GEP588:%.*]] = getelementptr double, ptr [[TMP427]], i64 8
-; CHECK-NEXT: [[COL_LOAD589:%.*]] = load <2 x double>, ptr [[VEC_GEP588]], align 8
-; CHECK-NEXT: [[TMP428:%.*]] = getelementptr double, ptr [[B]], i64 20
-; CHECK-NEXT: [[COL_LOAD590:%.*]] = load <2 x double>, ptr [[TMP428]], align 8
-; CHECK-NEXT: [[VEC_GEP591:%.*]] = getelementptr double, ptr [[TMP428]], i64 8
-; CHECK-NEXT: [[COL_LOAD592:%.*]] = load <2 x double>, ptr [[VEC_GEP591]], align 8
-; CHECK-NEXT: [[BLOCK593:%.*]] = shufflevector <2 x double> [[TMP420]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK594:%.*]] = shufflevector <2 x double> [[COL_LOAD587]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP429:%.*]] = extractelement <2 x double> [[COL_LOAD590]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT595:%.*]] = insertelement <2 x double> poison, double [[TMP429]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT596:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT595]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP430:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK594]], <2 x double> [[SPLAT_SPLAT596]], <2 x double> [[BLOCK593]])
-; CHECK-NEXT: [[BLOCK597:%.*]] = shufflevector <2 x double> [[COL_LOAD589]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP431:%.*]] = extractelement <2 x double> [[COL_LOAD590]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT598:%.*]] = insertelement <2 x double> poison, double [[TMP431]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT599:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT598]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP432:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK597]], <2 x double> [[SPLAT_SPLAT599]], <2 x double> [[TMP430]])
-; CHECK-NEXT: [[TMP433:%.*]] = shufflevector <2 x double> [[TMP432]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP434:%.*]] = shufflevector <2 x double> [[TMP420]], <2 x double> [[TMP433]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK600:%.*]] = shufflevector <2 x double> [[TMP426]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK601:%.*]] = shufflevector <2 x double> [[COL_LOAD587]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP435:%.*]] = extractelement <2 x double> [[COL_LOAD592]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT602:%.*]] = insertelement <2 x double> poison, double [[TMP435]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT603:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT602]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP436:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK601]], <2 x double> [[SPLAT_SPLAT603]], <2 x double> [[BLOCK600]])
-; CHECK-NEXT: [[BLOCK604:%.*]] = shufflevector <2 x double> [[COL_LOAD589]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP437:%.*]] = extractelement <2 x double> [[COL_LOAD592]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT605:%.*]] = insertelement <2 x double> poison, double [[TMP437]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT606:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT605]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP438:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK604]], <2 x double> [[SPLAT_SPLAT606]], <2 x double> [[TMP436]])
-; CHECK-NEXT: [[TMP439:%.*]] = shufflevector <2 x double> [[TMP438]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP440:%.*]] = shufflevector <2 x double> [[TMP426]], <2 x double> [[TMP439]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP441:%.*]] = getelementptr double, ptr [[A]], i64 54
-; CHECK-NEXT: [[COL_LOAD607:%.*]] = load <2 x double>, ptr [[TMP441]], align 8
-; CHECK-NEXT: [[VEC_GEP608:%.*]] = getelementptr double, ptr [[TMP441]], i64 8
-; CHECK-NEXT: [[COL_LOAD609:%.*]] = load <2 x double>, ptr [[VEC_GEP608]], align 8
-; CHECK-NEXT: [[TMP442:%.*]] = getelementptr double, ptr [[B]], i64 22
-; CHECK-NEXT: [[COL_LOAD610:%.*]] = load <2 x double>, ptr [[TMP442]], align 8
-; CHECK-NEXT: [[VEC_GEP611:%.*]] = getelementptr double, ptr [[TMP442]], i64 8
-; CHECK-NEXT: [[COL_LOAD612:%.*]] = load <2 x double>, ptr [[VEC_GEP611]], align 8
-; CHECK-NEXT: [[BLOCK613:%.*]] = shufflevector <2 x double> [[TMP434]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK614:%.*]] = shufflevector <2 x double> [[COL_LOAD607]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP443:%.*]] = extractelement <2 x double> [[COL_LOAD610]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT615:%.*]] = insertelement <2 x double> poison, double [[TMP443]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT616:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT615]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP444:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK614]], <2 x double> [[SPLAT_SPLAT616]], <2 x double> [[BLOCK613]])
-; CHECK-NEXT: [[BLOCK617:%.*]] = shufflevector <2 x double> [[COL_LOAD609]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP445:%.*]] = extractelement <2 x double> [[COL_LOAD610]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT618:%.*]] = insertelement <2 x double> poison, double [[TMP445]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT619:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT618]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP446:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK617]], <2 x double> [[SPLAT_SPLAT619]], <2 x double> [[TMP444]])
-; CHECK-NEXT: [[TMP447:%.*]] = shufflevector <2 x double> [[TMP446]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP448:%.*]] = shufflevector <2 x double> [[TMP434]], <2 x double> [[TMP447]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK620:%.*]] = shufflevector <2 x double> [[TMP440]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK621:%.*]] = shufflevector <2 x double> [[COL_LOAD607]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP449:%.*]] = extractelement <2 x double> [[COL_LOAD612]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT622:%.*]] = insertelement <2 x double> poison, double [[TMP449]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT623:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT622]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP450:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK621]], <2 x double> [[SPLAT_SPLAT623]], <2 x double> [[BLOCK620]])
-; CHECK-NEXT: [[BLOCK624:%.*]] = shufflevector <2 x double> [[COL_LOAD609]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP451:%.*]] = extractelement <2 x double> [[COL_LOAD612]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT625:%.*]] = insertelement <2 x double> poison, double [[TMP451]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT626:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT625]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP452:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK624]], <2 x double> [[SPLAT_SPLAT626]], <2 x double> [[TMP450]])
-; CHECK-NEXT: [[TMP453:%.*]] = shufflevector <2 x double> [[TMP452]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP454:%.*]] = shufflevector <2 x double> [[TMP440]], <2 x double> [[TMP453]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP455:%.*]] = getelementptr double, ptr [[C]], i64 22
-; CHECK-NEXT: store <2 x double> [[TMP448]], ptr [[TMP455]], align 8
-; CHECK-NEXT: [[VEC_GEP627:%.*]] = getelementptr double, ptr [[TMP455]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP454]], ptr [[VEC_GEP627]], align 8
-; CHECK-NEXT: [[TMP456:%.*]] = getelementptr double, ptr [[A]], i64 0
-; CHECK-NEXT: [[COL_LOAD628:%.*]] = load <2 x double>, ptr [[TMP456]], align 8
-; CHECK-NEXT: [[VEC_GEP629:%.*]] = getelementptr double, ptr [[TMP456]], i64 8
-; CHECK-NEXT: [[COL_LOAD630:%.*]] = load <2 x double>, ptr [[VEC_GEP629]], align 8
-; CHECK-NEXT: [[TMP457:%.*]] = getelementptr double, ptr [[B]], i64 32
-; CHECK-NEXT: [[COL_LOAD631:%.*]] = load <2 x double>, ptr [[TMP457]], align 8
-; CHECK-NEXT: [[VEC_GEP632:%.*]] = getelementptr double, ptr [[TMP457]], i64 8
-; CHECK-NEXT: [[COL_LOAD633:%.*]] = load <2 x double>, ptr [[VEC_GEP632]], align 8
-; CHECK-NEXT: [[BLOCK634:%.*]] = shufflevector <2 x double> [[COL_LOAD628]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP458:%.*]] = extractelement <2 x double> [[COL_LOAD631]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT635:%.*]] = insertelement <2 x double> poison, double [[TMP458]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT636:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT635]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP459:%.*]] = fmul contract <2 x double> [[BLOCK634]], [[SPLAT_SPLAT636]]
-; CHECK-NEXT: [[BLOCK637:%.*]] = shufflevector <2 x double> [[COL_LOAD630]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP460:%.*]] = extractelement <2 x double> [[COL_LOAD631]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT638:%.*]] = insertelement <2 x double> poison, double [[TMP460]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT639:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT638]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP461:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK637]], <2 x double> [[SPLAT_SPLAT639]], <2 x double> [[TMP459]])
-; CHECK-NEXT: [[TMP462:%.*]] = shufflevector <2 x double> [[TMP461]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP463:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP462]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK640:%.*]] = shufflevector <2 x double> [[COL_LOAD628]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP464:%.*]] = extractelement <2 x double> [[COL_LOAD633]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT641:%.*]] = insertelement <2 x double> poison, double [[TMP464]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT642:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT641]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP465:%.*]] = fmul contract <2 x double> [[BLOCK640]], [[SPLAT_SPLAT642]]
-; CHECK-NEXT: [[BLOCK643:%.*]] = shufflevector <2 x double> [[COL_LOAD630]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP466:%.*]] = extractelement <2 x double> [[COL_LOAD633]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT644:%.*]] = insertelement <2 x double> poison, double [[TMP466]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT645:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT644]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP467:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK643]], <2 x double> [[SPLAT_SPLAT645]], <2 x double> [[TMP465]])
-; CHECK-NEXT: [[TMP468:%.*]] = shufflevector <2 x double> [[TMP467]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP469:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP468]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP470:%.*]] = getelementptr double, ptr [[A]], i64 16
-; CHECK-NEXT: [[COL_LOAD646:%.*]] = load <2 x double>, ptr [[TMP470]], align 8
-; CHECK-NEXT: [[VEC_GEP647:%.*]] = getelementptr double, ptr [[TMP470]], i64 8
-; CHECK-NEXT: [[COL_LOAD648:%.*]] = load <2 x double>, ptr [[VEC_GEP647]], align 8
-; CHECK-NEXT: [[TMP471:%.*]] = getelementptr double, ptr [[B]], i64 34
-; CHECK-NEXT: [[COL_LOAD649:%.*]] = load <2 x double>, ptr [[TMP471]], align 8
-; CHECK-NEXT: [[VEC_GEP650:%.*]] = getelementptr double, ptr [[TMP471]], i64 8
-; CHECK-NEXT: [[COL_LOAD651:%.*]] = load <2 x double>, ptr [[VEC_GEP650]], align 8
-; CHECK-NEXT: [[BLOCK652:%.*]] = shufflevector <2 x double> [[TMP463]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK653:%.*]] = shufflevector <2 x double> [[COL_LOAD646]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP472:%.*]] = extractelement <2 x double> [[COL_LOAD649]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT654:%.*]] = insertelement <2 x double> poison, double [[TMP472]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT655:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT654]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP473:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK653]], <2 x double> [[SPLAT_SPLAT655]], <2 x double> [[BLOCK652]])
-; CHECK-NEXT: [[BLOCK656:%.*]] = shufflevector <2 x double> [[COL_LOAD648]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP474:%.*]] = extractelement <2 x double> [[COL_LOAD649]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT657:%.*]] = insertelement <2 x double> poison, double [[TMP474]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT658:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT657]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP475:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK656]], <2 x double> [[SPLAT_SPLAT658]], <2 x double> [[TMP473]])
-; CHECK-NEXT: [[TMP476:%.*]] = shufflevector <2 x double> [[TMP475]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP477:%.*]] = shufflevector <2 x double> [[TMP463]], <2 x double> [[TMP476]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK659:%.*]] = shufflevector <2 x double> [[TMP469]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK660:%.*]] = shufflevector <2 x double> [[COL_LOAD646]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP478:%.*]] = extractelement <2 x double> [[COL_LOAD651]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT661:%.*]] = insertelement <2 x double> poison, double [[TMP478]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT662:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT661]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP479:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK660]], <2 x double> [[SPLAT_SPLAT662]], <2 x double> [[BLOCK659]])
-; CHECK-NEXT: [[BLOCK663:%.*]] = shufflevector <2 x double> [[COL_LOAD648]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP480:%.*]] = extractelement <2 x double> [[COL_LOAD651]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT664:%.*]] = insertelement <2 x double> poison, double [[TMP480]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT665:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT664]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP481:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK663]], <2 x double> [[SPLAT_SPLAT665]], <2 x double> [[TMP479]])
-; CHECK-NEXT: [[TMP482:%.*]] = shufflevector <2 x double> [[TMP481]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP483:%.*]] = shufflevector <2 x double> [[TMP469]], <2 x double> [[TMP482]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP484:%.*]] = getelementptr double, ptr [[A]], i64 32
-; CHECK-NEXT: [[COL_LOAD666:%.*]] = load <2 x double>, ptr [[TMP484]], align 8
-; CHECK-NEXT: [[VEC_GEP667:%.*]] = getelementptr double, ptr [[TMP484]], i64 8
-; CHECK-NEXT: [[COL_LOAD668:%.*]] = load <2 x double>, ptr [[VEC_GEP667]], align 8
-; CHECK-NEXT: [[TMP485:%.*]] = getelementptr double, ptr [[B]], i64 36
-; CHECK-NEXT: [[COL_LOAD669:%.*]] = load <2 x double>, ptr [[TMP485]], align 8
-; CHECK-NEXT: [[VEC_GEP670:%.*]] = getelementptr double, ptr [[TMP485]], i64 8
-; CHECK-NEXT: [[COL_LOAD671:%.*]] = load <2 x double>, ptr [[VEC_GEP670]], align 8
-; CHECK-NEXT: [[BLOCK672:%.*]] = shufflevector <2 x double> [[TMP477]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK673:%.*]] = shufflevector <2 x double> [[COL_LOAD666]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP486:%.*]] = extractelement <2 x double> [[COL_LOAD669]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT674:%.*]] = insertelement <2 x double> poison, double [[TMP486]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT675:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT674]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP487:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK673]], <2 x double> [[SPLAT_SPLAT675]], <2 x double> [[BLOCK672]])
-; CHECK-NEXT: [[BLOCK676:%.*]] = shufflevector <2 x double> [[COL_LOAD668]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP488:%.*]] = extractelement <2 x double> [[COL_LOAD669]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT677:%.*]] = insertelement <2 x double> poison, double [[TMP488]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT678:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT677]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP489:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK676]], <2 x double> [[SPLAT_SPLAT678]], <2 x double> [[TMP487]])
-; CHECK-NEXT: [[TMP490:%.*]] = shufflevector <2 x double> [[TMP489]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP491:%.*]] = shufflevector <2 x double> [[TMP477]], <2 x double> [[TMP490]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK679:%.*]] = shufflevector <2 x double> [[TMP483]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK680:%.*]] = shufflevector <2 x double> [[COL_LOAD666]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP492:%.*]] = extractelement <2 x double> [[COL_LOAD671]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT681:%.*]] = insertelement <2 x double> poison, double [[TMP492]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT682:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT681]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP493:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK680]], <2 x double> [[SPLAT_SPLAT682]], <2 x double> [[BLOCK679]])
-; CHECK-NEXT: [[BLOCK683:%.*]] = shufflevector <2 x double> [[COL_LOAD668]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP494:%.*]] = extractelement <2 x double> [[COL_LOAD671]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT684:%.*]] = insertelement <2 x double> poison, double [[TMP494]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT685:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT684]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP495:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK683]], <2 x double> [[SPLAT_SPLAT685]], <2 x double> [[TMP493]])
-; CHECK-NEXT: [[TMP496:%.*]] = shufflevector <2 x double> [[TMP495]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP497:%.*]] = shufflevector <2 x double> [[TMP483]], <2 x double> [[TMP496]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP498:%.*]] = getelementptr double, ptr [[A]], i64 48
-; CHECK-NEXT: [[COL_LOAD686:%.*]] = load <2 x double>, ptr [[TMP498]], align 8
-; CHECK-NEXT: [[VEC_GEP687:%.*]] = getelementptr double, ptr [[TMP498]], i64 8
-; CHECK-NEXT: [[COL_LOAD688:%.*]] = load <2 x double>, ptr [[VEC_GEP687]], align 8
-; CHECK-NEXT: [[TMP499:%.*]] = getelementptr double, ptr [[B]], i64 38
-; CHECK-NEXT: [[COL_LOAD689:%.*]] = load <2 x double>, ptr [[TMP499]], align 8
-; CHECK-NEXT: [[VEC_GEP690:%.*]] = getelementptr double, ptr [[TMP499]], i64 8
-; CHECK-NEXT: [[COL_LOAD691:%.*]] = load <2 x double>, ptr [[VEC_GEP690]], align 8
-; CHECK-NEXT: [[BLOCK692:%.*]] = shufflevector <2 x double> [[TMP491]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK693:%.*]] = shufflevector <2 x double> [[COL_LOAD686]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP500:%.*]] = extractelement <2 x double> [[COL_LOAD689]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT694:%.*]] = insertelement <2 x double> poison, double [[TMP500]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT695:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT694]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP501:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK693]], <2 x double> [[SPLAT_SPLAT695]], <2 x double> [[BLOCK692]])
-; CHECK-NEXT: [[BLOCK696:%.*]] = shufflevector <2 x double> [[COL_LOAD688]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP502:%.*]] = extractelement <2 x double> [[COL_LOAD689]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT697:%.*]] = insertelement <2 x double> poison, double [[TMP502]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT698:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT697]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP503:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK696]], <2 x double> [[SPLAT_SPLAT698]], <2 x double> [[TMP501]])
-; CHECK-NEXT: [[TMP504:%.*]] = shufflevector <2 x double> [[TMP503]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP505:%.*]] = shufflevector <2 x double> [[TMP491]], <2 x double> [[TMP504]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK699:%.*]] = shufflevector <2 x double> [[TMP497]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK700:%.*]] = shufflevector <2 x double> [[COL_LOAD686]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP506:%.*]] = extractelement <2 x double> [[COL_LOAD691]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT701:%.*]] = insertelement <2 x double> poison, double [[TMP506]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT702:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT701]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP507:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK700]], <2 x double> [[SPLAT_SPLAT702]], <2 x double> [[BLOCK699]])
-; CHECK-NEXT: [[BLOCK703:%.*]] = shufflevector <2 x double> [[COL_LOAD688]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP508:%.*]] = extractelement <2 x double> [[COL_LOAD691]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT704:%.*]] = insertelement <2 x double> poison, double [[TMP508]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT705:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT704]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP509:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK703]], <2 x double> [[SPLAT_SPLAT705]], <2 x double> [[TMP507]])
-; CHECK-NEXT: [[TMP510:%.*]] = shufflevector <2 x double> [[TMP509]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP511:%.*]] = shufflevector <2 x double> [[TMP497]], <2 x double> [[TMP510]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP512:%.*]] = getelementptr double, ptr [[C]], i64 32
-; CHECK-NEXT: store <2 x double> [[TMP505]], ptr [[TMP512]], align 8
-; CHECK-NEXT: [[VEC_GEP706:%.*]] = getelementptr double, ptr [[TMP512]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP511]], ptr [[VEC_GEP706]], align 8
-; CHECK-NEXT: [[TMP513:%.*]] = getelementptr double, ptr [[A]], i64 2
-; CHECK-NEXT: [[COL_LOAD707:%.*]] = load <2 x double>, ptr [[TMP513]], align 8
-; CHECK-NEXT: [[VEC_GEP708:%.*]] = getelementptr double, ptr [[TMP513]], i64 8
-; CHECK-NEXT: [[COL_LOAD709:%.*]] = load <2 x double>, ptr [[VEC_GEP708]], align 8
-; CHECK-NEXT: [[TMP514:%.*]] = getelementptr double, ptr [[B]], i64 32
-; CHECK-NEXT: [[COL_LOAD710:%.*]] = load <2 x double>, ptr [[TMP514]], align 8
-; CHECK-NEXT: [[VEC_GEP711:%.*]] = getelementptr double, ptr [[TMP514]], i64 8
-; CHECK-NEXT: [[COL_LOAD712:%.*]] = load <2 x double>, ptr [[VEC_GEP711]], align 8
-; CHECK-NEXT: [[BLOCK713:%.*]] = shufflevector <2 x double> [[COL_LOAD707]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP515:%.*]] = extractelement <2 x double> [[COL_LOAD710]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT714:%.*]] = insertelement <2 x double> poison, double [[TMP515]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT715:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT714]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP516:%.*]] = fmul contract <2 x double> [[BLOCK713]], [[SPLAT_SPLAT715]]
-; CHECK-NEXT: [[BLOCK716:%.*]] = shufflevector <2 x double> [[COL_LOAD709]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP517:%.*]] = extractelement <2 x double> [[COL_LOAD710]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT717:%.*]] = insertelement <2 x double> poison, double [[TMP517]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT718:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT717]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP518:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK716]], <2 x double> [[SPLAT_SPLAT718]], <2 x double> [[TMP516]])
-; CHECK-NEXT: [[TMP519:%.*]] = shufflevector <2 x double> [[TMP518]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP520:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP519]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK719:%.*]] = shufflevector <2 x double> [[COL_LOAD707]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP521:%.*]] = extractelement <2 x double> [[COL_LOAD712]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT720:%.*]] = insertelement <2 x double> poison, double [[TMP521]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT721:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT720]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP522:%.*]] = fmul contract <2 x double> [[BLOCK719]], [[SPLAT_SPLAT721]]
-; CHECK-NEXT: [[BLOCK722:%.*]] = shufflevector <2 x double> [[COL_LOAD709]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP523:%.*]] = extractelement <2 x double> [[COL_LOAD712]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT723:%.*]] = insertelement <2 x double> poison, double [[TMP523]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT724:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT723]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP524:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK722]], <2 x double> [[SPLAT_SPLAT724]], <2 x double> [[TMP522]])
-; CHECK-NEXT: [[TMP525:%.*]] = shufflevector <2 x double> [[TMP524]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP526:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP525]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP527:%.*]] = getelementptr double, ptr [[A]], i64 18
-; CHECK-NEXT: [[COL_LOAD725:%.*]] = load <2 x double>, ptr [[TMP527]], align 8
-; CHECK-NEXT: [[VEC_GEP726:%.*]] = getelementptr double, ptr [[TMP527]], i64 8
-; CHECK-NEXT: [[COL_LOAD727:%.*]] = load <2 x double>, ptr [[VEC_GEP726]], align 8
-; CHECK-NEXT: [[TMP528:%.*]] = getelementptr double, ptr [[B]], i64 34
-; CHECK-NEXT: [[COL_LOAD728:%.*]] = load <2 x double>, ptr [[TMP528]], align 8
-; CHECK-NEXT: [[VEC_GEP729:%.*]] = getelementptr double, ptr [[TMP528]], i64 8
-; CHECK-NEXT: [[COL_LOAD730:%.*]] = load <2 x double>, ptr [[VEC_GEP729]], align 8
-; CHECK-NEXT: [[BLOCK731:%.*]] = shufflevector <2 x double> [[TMP520]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK732:%.*]] = shufflevector <2 x double> [[COL_LOAD725]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP529:%.*]] = extractelement <2 x double> [[COL_LOAD728]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT733:%.*]] = insertelement <2 x double> poison, double [[TMP529]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT734:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT733]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP530:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK732]], <2 x double> [[SPLAT_SPLAT734]], <2 x double> [[BLOCK731]])
-; CHECK-NEXT: [[BLOCK735:%.*]] = shufflevector <2 x double> [[COL_LOAD727]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP531:%.*]] = extractelement <2 x double> [[COL_LOAD728]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT736:%.*]] = insertelement <2 x double> poison, double [[TMP531]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT737:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT736]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP532:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK735]], <2 x double> [[SPLAT_SPLAT737]], <2 x double> [[TMP530]])
-; CHECK-NEXT: [[TMP533:%.*]] = shufflevector <2 x double> [[TMP532]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP534:%.*]] = shufflevector <2 x double> [[TMP520]], <2 x double> [[TMP533]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK738:%.*]] = shufflevector <2 x double> [[TMP526]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK739:%.*]] = shufflevector <2 x double> [[COL_LOAD725]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP535:%.*]] = extractelement <2 x double> [[COL_LOAD730]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT740:%.*]] = insertelement <2 x double> poison, double [[TMP535]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT741:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT740]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP536:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK739]], <2 x double> [[SPLAT_SPLAT741]], <2 x double> [[BLOCK738]])
-; CHECK-NEXT: [[BLOCK742:%.*]] = shufflevector <2 x double> [[COL_LOAD727]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP537:%.*]] = extractelement <2 x double> [[COL_LOAD730]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT743:%.*]] = insertelement <2 x double> poison, double [[TMP537]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT744:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT743]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP538:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK742]], <2 x double> [[SPLAT_SPLAT744]], <2 x double> [[TMP536]])
-; CHECK-NEXT: [[TMP539:%.*]] = shufflevector <2 x double> [[TMP538]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP540:%.*]] = shufflevector <2 x double> [[TMP526]], <2 x double> [[TMP539]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP541:%.*]] = getelementptr double, ptr [[A]], i64 34
-; CHECK-NEXT: [[COL_LOAD745:%.*]] = load <2 x double>, ptr [[TMP541]], align 8
-; CHECK-NEXT: [[VEC_GEP746:%.*]] = getelementptr double, ptr [[TMP541]], i64 8
-; CHECK-NEXT: [[COL_LOAD747:%.*]] = load <2 x double>, ptr [[VEC_GEP746]], align 8
-; CHECK-NEXT: [[TMP542:%.*]] = getelementptr double, ptr [[B]], i64 36
-; CHECK-NEXT: [[COL_LOAD748:%.*]] = load <2 x double>, ptr [[TMP542]], align 8
-; CHECK-NEXT: [[VEC_GEP749:%.*]] = getelementptr double, ptr [[TMP542]], i64 8
-; CHECK-NEXT: [[COL_LOAD750:%.*]] = load <2 x double>, ptr [[VEC_GEP749]], align 8
-; CHECK-NEXT: [[BLOCK751:%.*]] = shufflevector <2 x double> [[TMP534]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK752:%.*]] = shufflevector <2 x double> [[COL_LOAD745]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP543:%.*]] = extractelement <2 x double> [[COL_LOAD748]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT753:%.*]] = insertelement <2 x double> poison, double [[TMP543]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT754:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT753]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP544:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK752]], <2 x double> [[SPLAT_SPLAT754]], <2 x double> [[BLOCK751]])
-; CHECK-NEXT: [[BLOCK755:%.*]] = shufflevector <2 x double> [[COL_LOAD747]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP545:%.*]] = extractelement <2 x double> [[COL_LOAD748]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT756:%.*]] = insertelement <2 x double> poison, double [[TMP545]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT757:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT756]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP546:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK755]], <2 x double> [[SPLAT_SPLAT757]], <2 x double> [[TMP544]])
-; CHECK-NEXT: [[TMP547:%.*]] = shufflevector <2 x double> [[TMP546]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP548:%.*]] = shufflevector <2 x double> [[TMP534]], <2 x double> [[TMP547]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK758:%.*]] = shufflevector <2 x double> [[TMP540]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK759:%.*]] = shufflevector <2 x double> [[COL_LOAD745]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP549:%.*]] = extractelement <2 x double> [[COL_LOAD750]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT760:%.*]] = insertelement <2 x double> poison, double [[TMP549]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT761:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT760]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP550:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK759]], <2 x double> [[SPLAT_SPLAT761]], <2 x double> [[BLOCK758]])
-; CHECK-NEXT: [[BLOCK762:%.*]] = shufflevector <2 x double> [[COL_LOAD747]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP551:%.*]] = extractelement <2 x double> [[COL_LOAD750]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT763:%.*]] = insertelement <2 x double> poison, double [[TMP551]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT764:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT763]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP552:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK762]], <2 x double> [[SPLAT_SPLAT764]], <2 x double> [[TMP550]])
-; CHECK-NEXT: [[TMP553:%.*]] = shufflevector <2 x double> [[TMP552]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP554:%.*]] = shufflevector <2 x double> [[TMP540]], <2 x double> [[TMP553]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP555:%.*]] = getelementptr double, ptr [[A]], i64 50
-; CHECK-NEXT: [[COL_LOAD765:%.*]] = load <2 x double>, ptr [[TMP555]], align 8
-; CHECK-NEXT: [[VEC_GEP766:%.*]] = getelementptr double, ptr [[TMP555]], i64 8
-; CHECK-NEXT: [[COL_LOAD767:%.*]] = load <2 x double>, ptr [[VEC_GEP766]], align 8
-; CHECK-NEXT: [[TMP556:%.*]] = getelementptr double, ptr [[B]], i64 38
-; CHECK-NEXT: [[COL_LOAD768:%.*]] = load <2 x double>, ptr [[TMP556]], align 8
-; CHECK-NEXT: [[VEC_GEP769:%.*]] = getelementptr double, ptr [[TMP556]], i64 8
-; CHECK-NEXT: [[COL_LOAD770:%.*]] = load <2 x double>, ptr [[VEC_GEP769]], align 8
-; CHECK-NEXT: [[BLOCK771:%.*]] = shufflevector <2 x double> [[TMP548]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK772:%.*]] = shufflevector <2 x double> [[COL_LOAD765]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP557:%.*]] = extractelement <2 x double> [[COL_LOAD768]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT773:%.*]] = insertelement <2 x double> poison, double [[TMP557]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT774:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT773]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP558:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK772]], <2 x double> [[SPLAT_SPLAT774]], <2 x double> [[BLOCK771]])
-; CHECK-NEXT: [[BLOCK775:%.*]] = shufflevector <2 x double> [[COL_LOAD767]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP559:%.*]] = extractelement <2 x double> [[COL_LOAD768]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT776:%.*]] = insertelement <2 x double> poison, double [[TMP559]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT777:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT776]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP560:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK775]], <2 x double> [[SPLAT_SPLAT777]], <2 x double> [[TMP558]])
-; CHECK-NEXT: [[TMP561:%.*]] = shufflevector <2 x double> [[TMP560]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP562:%.*]] = shufflevector <2 x double> [[TMP548]], <2 x double> [[TMP561]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK778:%.*]] = shufflevector <2 x double> [[TMP554]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK779:%.*]] = shufflevector <2 x double> [[COL_LOAD765]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP563:%.*]] = extractelement <2 x double> [[COL_LOAD770]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT780:%.*]] = insertelement <2 x double> poison, double [[TMP563]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT781:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT780]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP564:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK779]], <2 x double> [[SPLAT_SPLAT781]], <2 x double> [[BLOCK778]])
-; CHECK-NEXT: [[BLOCK782:%.*]] = shufflevector <2 x double> [[COL_LOAD767]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP565:%.*]] = extractelement <2 x double> [[COL_LOAD770]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT783:%.*]] = insertelement <2 x double> poison, double [[TMP565]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT784:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT783]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP566:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK782]], <2 x double> [[SPLAT_SPLAT784]], <2 x double> [[TMP564]])
-; CHECK-NEXT: [[TMP567:%.*]] = shufflevector <2 x double> [[TMP566]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP568:%.*]] = shufflevector <2 x double> [[TMP554]], <2 x double> [[TMP567]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP569:%.*]] = getelementptr double, ptr [[C]], i64 34
-; CHECK-NEXT: store <2 x double> [[TMP562]], ptr [[TMP569]], align 8
-; CHECK-NEXT: [[VEC_GEP785:%.*]] = getelementptr double, ptr [[TMP569]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP568]], ptr [[VEC_GEP785]], align 8
-; CHECK-NEXT: [[TMP570:%.*]] = getelementptr double, ptr [[A]], i64 4
-; CHECK-NEXT: [[COL_LOAD786:%.*]] = load <2 x double>, ptr [[TMP570]], align 8
-; CHECK-NEXT: [[VEC_GEP787:%.*]] = getelementptr double, ptr [[TMP570]], i64 8
-; CHECK-NEXT: [[COL_LOAD788:%.*]] = load <2 x double>, ptr [[VEC_GEP787]], align 8
-; CHECK-NEXT: [[TMP571:%.*]] = getelementptr double, ptr [[B]], i64 32
-; CHECK-NEXT: [[COL_LOAD789:%.*]] = load <2 x double>, ptr [[TMP571]], align 8
-; CHECK-NEXT: [[VEC_GEP790:%.*]] = getelementptr double, ptr [[TMP571]], i64 8
-; CHECK-NEXT: [[COL_LOAD791:%.*]] = load <2 x double>, ptr [[VEC_GEP790]], align 8
-; CHECK-NEXT: [[BLOCK792:%.*]] = shufflevector <2 x double> [[COL_LOAD786]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP572:%.*]] = extractelement <2 x double> [[COL_LOAD789]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT793:%.*]] = insertelement <2 x double> poison, double [[TMP572]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT794:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT793]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP573:%.*]] = fmul contract <2 x double> [[BLOCK792]], [[SPLAT_SPLAT794]]
-; CHECK-NEXT: [[BLOCK795:%.*]] = shufflevector <2 x double> [[COL_LOAD788]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP574:%.*]] = extractelement <2 x double> [[COL_LOAD789]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT796:%.*]] = insertelement <2 x double> poison, double [[TMP574]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT797:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT796]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP575:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK795]], <2 x double> [[SPLAT_SPLAT797]], <2 x double> [[TMP573]])
-; CHECK-NEXT: [[TMP576:%.*]] = shufflevector <2 x double> [[TMP575]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP577:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP576]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK798:%.*]] = shufflevector <2 x double> [[COL_LOAD786]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP578:%.*]] = extractelement <2 x double> [[COL_LOAD791]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT799:%.*]] = insertelement <2 x double> poison, double [[TMP578]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT800:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT799]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP579:%.*]] = fmul contract <2 x double> [[BLOCK798]], [[SPLAT_SPLAT800]]
-; CHECK-NEXT: [[BLOCK801:%.*]] = shufflevector <2 x double> [[COL_LOAD788]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP580:%.*]] = extractelement <2 x double> [[COL_LOAD791]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT802:%.*]] = insertelement <2 x double> poison, double [[TMP580]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT803:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT802]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP581:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK801]], <2 x double> [[SPLAT_SPLAT803]], <2 x double> [[TMP579]])
-; CHECK-NEXT: [[TMP582:%.*]] = shufflevector <2 x double> [[TMP581]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP583:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP582]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP584:%.*]] = getelementptr double, ptr [[A]], i64 20
-; CHECK-NEXT: [[COL_LOAD804:%.*]] = load <2 x double>, ptr [[TMP584]], align 8
-; CHECK-NEXT: [[VEC_GEP805:%.*]] = getelementptr double, ptr [[TMP584]], i64 8
-; CHECK-NEXT: [[COL_LOAD806:%.*]] = load <2 x double>, ptr [[VEC_GEP805]], align 8
-; CHECK-NEXT: [[TMP585:%.*]] = getelementptr double, ptr [[B]], i64 34
-; CHECK-NEXT: [[COL_LOAD807:%.*]] = load <2 x double>, ptr [[TMP585]], align 8
-; CHECK-NEXT: [[VEC_GEP808:%.*]] = getelementptr double, ptr [[TMP585]], i64 8
-; CHECK-NEXT: [[COL_LOAD809:%.*]] = load <2 x double>, ptr [[VEC_GEP808]], align 8
-; CHECK-NEXT: [[BLOCK810:%.*]] = shufflevector <2 x double> [[TMP577]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK811:%.*]] = shufflevector <2 x double> [[COL_LOAD804]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP586:%.*]] = extractelement <2 x double> [[COL_LOAD807]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT812:%.*]] = insertelement <2 x double> poison, double [[TMP586]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT813:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT812]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP587:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK811]], <2 x double> [[SPLAT_SPLAT813]], <2 x double> [[BLOCK810]])
-; CHECK-NEXT: [[BLOCK814:%.*]] = shufflevector <2 x double> [[COL_LOAD806]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP588:%.*]] = extractelement <2 x double> [[COL_LOAD807]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT815:%.*]] = insertelement <2 x double> poison, double [[TMP588]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT816:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT815]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP589:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK814]], <2 x double> [[SPLAT_SPLAT816]], <2 x double> [[TMP587]])
-; CHECK-NEXT: [[TMP590:%.*]] = shufflevector <2 x double> [[TMP589]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP591:%.*]] = shufflevector <2 x double> [[TMP577]], <2 x double> [[TMP590]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK817:%.*]] = shufflevector <2 x double> [[TMP583]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK818:%.*]] = shufflevector <2 x double> [[COL_LOAD804]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP592:%.*]] = extractelement <2 x double> [[COL_LOAD809]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT819:%.*]] = insertelement <2 x double> poison, double [[TMP592]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT820:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT819]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP593:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK818]], <2 x double> [[SPLAT_SPLAT820]], <2 x double> [[BLOCK817]])
-; CHECK-NEXT: [[BLOCK821:%.*]] = shufflevector <2 x double> [[COL_LOAD806]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP594:%.*]] = extractelement <2 x double> [[COL_LOAD809]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT822:%.*]] = insertelement <2 x double> poison, double [[TMP594]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT823:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT822]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP595:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK821]], <2 x double> [[SPLAT_SPLAT823]], <2 x double> [[TMP593]])
-; CHECK-NEXT: [[TMP596:%.*]] = shufflevector <2 x double> [[TMP595]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP597:%.*]] = shufflevector <2 x double> [[TMP583]], <2 x double> [[TMP596]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP598:%.*]] = getelementptr double, ptr [[A]], i64 36
-; CHECK-NEXT: [[COL_LOAD824:%.*]] = load <2 x double>, ptr [[TMP598]], align 8
-; CHECK-NEXT: [[VEC_GEP825:%.*]] = getelementptr double, ptr [[TMP598]], i64 8
-; CHECK-NEXT: [[COL_LOAD826:%.*]] = load <2 x double>, ptr [[VEC_GEP825]], align 8
-; CHECK-NEXT: [[TMP599:%.*]] = getelementptr double, ptr [[B]], i64 36
-; CHECK-NEXT: [[COL_LOAD827:%.*]] = load <2 x double>, ptr [[TMP599]], align 8
-; CHECK-NEXT: [[VEC_GEP828:%.*]] = getelementptr double, ptr [[TMP599]], i64 8
-; CHECK-NEXT: [[COL_LOAD829:%.*]] = load <2 x double>, ptr [[VEC_GEP828]], align 8
-; CHECK-NEXT: [[BLOCK830:%.*]] = shufflevector <2 x double> [[TMP591]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK831:%.*]] = shufflevector <2 x double> [[COL_LOAD824]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP600:%.*]] = extractelement <2 x double> [[COL_LOAD827]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT832:%.*]] = insertelement <2 x double> poison, double [[TMP600]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT833:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT832]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP601:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK831]], <2 x double> [[SPLAT_SPLAT833]], <2 x double> [[BLOCK830]])
-; CHECK-NEXT: [[BLOCK834:%.*]] = shufflevector <2 x double> [[COL_LOAD826]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP602:%.*]] = extractelement <2 x double> [[COL_LOAD827]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT835:%.*]] = insertelement <2 x double> poison, double [[TMP602]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT836:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT835]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP603:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK834]], <2 x double> [[SPLAT_SPLAT836]], <2 x double> [[TMP601]])
-; CHECK-NEXT: [[TMP604:%.*]] = shufflevector <2 x double> [[TMP603]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP605:%.*]] = shufflevector <2 x double> [[TMP591]], <2 x double> [[TMP604]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK837:%.*]] = shufflevector <2 x double> [[TMP597]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK838:%.*]] = shufflevector <2 x double> [[COL_LOAD824]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP606:%.*]] = extractelement <2 x double> [[COL_LOAD829]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT839:%.*]] = insertelement <2 x double> poison, double [[TMP606]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT840:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT839]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP607:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK838]], <2 x double> [[SPLAT_SPLAT840]], <2 x double> [[BLOCK837]])
-; CHECK-NEXT: [[BLOCK841:%.*]] = shufflevector <2 x double> [[COL_LOAD826]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP608:%.*]] = extractelement <2 x double> [[COL_LOAD829]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT842:%.*]] = insertelement <2 x double> poison, double [[TMP608]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT843:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT842]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP609:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK841]], <2 x double> [[SPLAT_SPLAT843]], <2 x double> [[TMP607]])
-; CHECK-NEXT: [[TMP610:%.*]] = shufflevector <2 x double> [[TMP609]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP611:%.*]] = shufflevector <2 x double> [[TMP597]], <2 x double> [[TMP610]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP612:%.*]] = getelementptr double, ptr [[A]], i64 52
-; CHECK-NEXT: [[COL_LOAD844:%.*]] = load <2 x double>, ptr [[TMP612]], align 8
-; CHECK-NEXT: [[VEC_GEP845:%.*]] = getelementptr double, ptr [[TMP612]], i64 8
-; CHECK-NEXT: [[COL_LOAD846:%.*]] = load <2 x double>, ptr [[VEC_GEP845]], align 8
-; CHECK-NEXT: [[TMP613:%.*]] = getelementptr double, ptr [[B]], i64 38
-; CHECK-NEXT: [[COL_LOAD847:%.*]] = load <2 x double>, ptr [[TMP613]], align 8
-; CHECK-NEXT: [[VEC_GEP848:%.*]] = getelementptr double, ptr [[TMP613]], i64 8
-; CHECK-NEXT: [[COL_LOAD849:%.*]] = load <2 x double>, ptr [[VEC_GEP848]], align 8
-; CHECK-NEXT: [[BLOCK850:%.*]] = shufflevector <2 x double> [[TMP605]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK851:%.*]] = shufflevector <2 x double> [[COL_LOAD844]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP614:%.*]] = extractelement <2 x double> [[COL_LOAD847]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT852:%.*]] = insertelement <2 x double> poison, double [[TMP614]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT853:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT852]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP615:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK851]], <2 x double> [[SPLAT_SPLAT853]], <2 x double> [[BLOCK850]])
-; CHECK-NEXT: [[BLOCK854:%.*]] = shufflevector <2 x double> [[COL_LOAD846]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP616:%.*]] = extractelement <2 x double> [[COL_LOAD847]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT855:%.*]] = insertelement <2 x double> poison, double [[TMP616]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT856:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT855]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP617:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK854]], <2 x double> [[SPLAT_SPLAT856]], <2 x double> [[TMP615]])
-; CHECK-NEXT: [[TMP618:%.*]] = shufflevector <2 x double> [[TMP617]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP619:%.*]] = shufflevector <2 x double> [[TMP605]], <2 x double> [[TMP618]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK857:%.*]] = shufflevector <2 x double> [[TMP611]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK858:%.*]] = shufflevector <2 x double> [[COL_LOAD844]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP620:%.*]] = extractelement <2 x double> [[COL_LOAD849]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT859:%.*]] = insertelement <2 x double> poison, double [[TMP620]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT860:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT859]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP621:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK858]], <2 x double> [[SPLAT_SPLAT860]], <2 x double> [[BLOCK857]])
-; CHECK-NEXT: [[BLOCK861:%.*]] = shufflevector <2 x double> [[COL_LOAD846]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP622:%.*]] = extractelement <2 x double> [[COL_LOAD849]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT862:%.*]] = insertelement <2 x double> poison, double [[TMP622]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT863:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT862]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP623:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK861]], <2 x double> [[SPLAT_SPLAT863]], <2 x double> [[TMP621]])
-; CHECK-NEXT: [[TMP624:%.*]] = shufflevector <2 x double> [[TMP623]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP625:%.*]] = shufflevector <2 x double> [[TMP611]], <2 x double> [[TMP624]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP626:%.*]] = getelementptr double, ptr [[C]], i64 36
-; CHECK-NEXT: store <2 x double> [[TMP619]], ptr [[TMP626]], align 8
-; CHECK-NEXT: [[VEC_GEP864:%.*]] = getelementptr double, ptr [[TMP626]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP625]], ptr [[VEC_GEP864]], align 8
-; CHECK-NEXT: [[TMP627:%.*]] = getelementptr double, ptr [[A]], i64 6
-; CHECK-NEXT: [[COL_LOAD865:%.*]] = load <2 x double>, ptr [[TMP627]], align 8
-; CHECK-NEXT: [[VEC_GEP866:%.*]] = getelementptr double, ptr [[TMP627]], i64 8
-; CHECK-NEXT: [[COL_LOAD867:%.*]] = load <2 x double>, ptr [[VEC_GEP866]], align 8
-; CHECK-NEXT: [[TMP628:%.*]] = getelementptr double, ptr [[B]], i64 32
-; CHECK-NEXT: [[COL_LOAD868:%.*]] = load <2 x double>, ptr [[TMP628]], align 8
-; CHECK-NEXT: [[VEC_GEP869:%.*]] = getelementptr double, ptr [[TMP628]], i64 8
-; CHECK-NEXT: [[COL_LOAD870:%.*]] = load <2 x double>, ptr [[VEC_GEP869]], align 8
-; CHECK-NEXT: [[BLOCK871:%.*]] = shufflevector <2 x double> [[COL_LOAD865]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP629:%.*]] = extractelement <2 x double> [[COL_LOAD868]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT872:%.*]] = insertelement <2 x double> poison, double [[TMP629]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT873:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT872]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP630:%.*]] = fmul contract <2 x double> [[BLOCK871]], [[SPLAT_SPLAT873]]
-; CHECK-NEXT: [[BLOCK874:%.*]] = shufflevector <2 x double> [[COL_LOAD867]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP631:%.*]] = extractelement <2 x double> [[COL_LOAD868]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT875:%.*]] = insertelement <2 x double> poison, double [[TMP631]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT876:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT875]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP632:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK874]], <2 x double> [[SPLAT_SPLAT876]], <2 x double> [[TMP630]])
-; CHECK-NEXT: [[TMP633:%.*]] = shufflevector <2 x double> [[TMP632]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP634:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP633]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK877:%.*]] = shufflevector <2 x double> [[COL_LOAD865]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP635:%.*]] = extractelement <2 x double> [[COL_LOAD870]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT878:%.*]] = insertelement <2 x double> poison, double [[TMP635]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT879:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT878]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP636:%.*]] = fmul contract <2 x double> [[BLOCK877]], [[SPLAT_SPLAT879]]
-; CHECK-NEXT: [[BLOCK880:%.*]] = shufflevector <2 x double> [[COL_LOAD867]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP637:%.*]] = extractelement <2 x double> [[COL_LOAD870]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT881:%.*]] = insertelement <2 x double> poison, double [[TMP637]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT882:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT881]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP638:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK880]], <2 x double> [[SPLAT_SPLAT882]], <2 x double> [[TMP636]])
-; CHECK-NEXT: [[TMP639:%.*]] = shufflevector <2 x double> [[TMP638]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP640:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP639]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP641:%.*]] = getelementptr double, ptr [[A]], i64 22
-; CHECK-NEXT: [[COL_LOAD883:%.*]] = load <2 x double>, ptr [[TMP641]], align 8
-; CHECK-NEXT: [[VEC_GEP884:%.*]] = getelementptr double, ptr [[TMP641]], i64 8
-; CHECK-NEXT: [[COL_LOAD885:%.*]] = load <2 x double>, ptr [[VEC_GEP884]], align 8
-; CHECK-NEXT: [[TMP642:%.*]] = getelementptr double, ptr [[B]], i64 34
-; CHECK-NEXT: [[COL_LOAD886:%.*]] = load <2 x double>, ptr [[TMP642]], align 8
-; CHECK-NEXT: [[VEC_GEP887:%.*]] = getelementptr double, ptr [[TMP642]], i64 8
-; CHECK-NEXT: [[COL_LOAD888:%.*]] = load <2 x double>, ptr [[VEC_GEP887]], align 8
-; CHECK-NEXT: [[BLOCK889:%.*]] = shufflevector <2 x double> [[TMP634]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK890:%.*]] = shufflevector <2 x double> [[COL_LOAD883]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP643:%.*]] = extractelement <2 x double> [[COL_LOAD886]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT891:%.*]] = insertelement <2 x double> poison, double [[TMP643]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT892:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT891]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP644:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK890]], <2 x double> [[SPLAT_SPLAT892]], <2 x double> [[BLOCK889]])
-; CHECK-NEXT: [[BLOCK893:%.*]] = shufflevector <2 x double> [[COL_LOAD885]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP645:%.*]] = extractelement <2 x double> [[COL_LOAD886]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT894:%.*]] = insertelement <2 x double> poison, double [[TMP645]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT895:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT894]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP646:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK893]], <2 x double> [[SPLAT_SPLAT895]], <2 x double> [[TMP644]])
-; CHECK-NEXT: [[TMP647:%.*]] = shufflevector <2 x double> [[TMP646]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP648:%.*]] = shufflevector <2 x double> [[TMP634]], <2 x double> [[TMP647]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK896:%.*]] = shufflevector <2 x double> [[TMP640]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK897:%.*]] = shufflevector <2 x double> [[COL_LOAD883]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP649:%.*]] = extractelement <2 x double> [[COL_LOAD888]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT898:%.*]] = insertelement <2 x double> poison, double [[TMP649]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT899:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT898]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP650:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK897]], <2 x double> [[SPLAT_SPLAT899]], <2 x double> [[BLOCK896]])
-; CHECK-NEXT: [[BLOCK900:%.*]] = shufflevector <2 x double> [[COL_LOAD885]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP651:%.*]] = extractelement <2 x double> [[COL_LOAD888]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT901:%.*]] = insertelement <2 x double> poison, double [[TMP651]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT902:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT901]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP652:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK900]], <2 x double> [[SPLAT_SPLAT902]], <2 x double> [[TMP650]])
-; CHECK-NEXT: [[TMP653:%.*]] = shufflevector <2 x double> [[TMP652]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP654:%.*]] = shufflevector <2 x double> [[TMP640]], <2 x double> [[TMP653]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP655:%.*]] = getelementptr double, ptr [[A]], i64 38
-; CHECK-NEXT: [[COL_LOAD903:%.*]] = load <2 x double>, ptr [[TMP655]], align 8
-; CHECK-NEXT: [[VEC_GEP904:%.*]] = getelementptr double, ptr [[TMP655]], i64 8
-; CHECK-NEXT: [[COL_LOAD905:%.*]] = load <2 x double>, ptr [[VEC_GEP904]], align 8
-; CHECK-NEXT: [[TMP656:%.*]] = getelementptr double, ptr [[B]], i64 36
-; CHECK-NEXT: [[COL_LOAD906:%.*]] = load <2 x double>, ptr [[TMP656]], align 8
-; CHECK-NEXT: [[VEC_GEP907:%.*]] = getelementptr double, ptr [[TMP656]], i64 8
-; CHECK-NEXT: [[COL_LOAD908:%.*]] = load <2 x double>, ptr [[VEC_GEP907]], align 8
-; CHECK-NEXT: [[BLOCK909:%.*]] = shufflevector <2 x double> [[TMP648]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK910:%.*]] = shufflevector <2 x double> [[COL_LOAD903]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP657:%.*]] = extractelement <2 x double> [[COL_LOAD906]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT911:%.*]] = insertelement <2 x double> poison, double [[TMP657]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT912:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT911]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP658:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK910]], <2 x double> [[SPLAT_SPLAT912]], <2 x double> [[BLOCK909]])
-; CHECK-NEXT: [[BLOCK913:%.*]] = shufflevector <2 x double> [[COL_LOAD905]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP659:%.*]] = extractelement <2 x double> [[COL_LOAD906]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT914:%.*]] = insertelement <2 x double> poison, double [[TMP659]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT915:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT914]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP660:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK913]], <2 x double> [[SPLAT_SPLAT915]], <2 x double> [[TMP658]])
-; CHECK-NEXT: [[TMP661:%.*]] = shufflevector <2 x double> [[TMP660]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP662:%.*]] = shufflevector <2 x double> [[TMP648]], <2 x double> [[TMP661]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK916:%.*]] = shufflevector <2 x double> [[TMP654]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK917:%.*]] = shufflevector <2 x double> [[COL_LOAD903]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP663:%.*]] = extractelement <2 x double> [[COL_LOAD908]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT918:%.*]] = insertelement <2 x double> poison, double [[TMP663]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT919:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT918]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP664:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK917]], <2 x double> [[SPLAT_SPLAT919]], <2 x double> [[BLOCK916]])
-; CHECK-NEXT: [[BLOCK920:%.*]] = shufflevector <2 x double> [[COL_LOAD905]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP665:%.*]] = extractelement <2 x double> [[COL_LOAD908]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT921:%.*]] = insertelement <2 x double> poison, double [[TMP665]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT922:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT921]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP666:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK920]], <2 x double> [[SPLAT_SPLAT922]], <2 x double> [[TMP664]])
-; CHECK-NEXT: [[TMP667:%.*]] = shufflevector <2 x double> [[TMP666]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP668:%.*]] = shufflevector <2 x double> [[TMP654]], <2 x double> [[TMP667]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP669:%.*]] = getelementptr double, ptr [[A]], i64 54
-; CHECK-NEXT: [[COL_LOAD923:%.*]] = load <2 x double>, ptr [[TMP669]], align 8
-; CHECK-NEXT: [[VEC_GEP924:%.*]] = getelementptr double, ptr [[TMP669]], i64 8
-; CHECK-NEXT: [[COL_LOAD925:%.*]] = load <2 x double>, ptr [[VEC_GEP924]], align 8
-; CHECK-NEXT: [[TMP670:%.*]] = getelementptr double, ptr [[B]], i64 38
-; CHECK-NEXT: [[COL_LOAD926:%.*]] = load <2 x double>, ptr [[TMP670]], align 8
-; CHECK-NEXT: [[VEC_GEP927:%.*]] = getelementptr double, ptr [[TMP670]], i64 8
-; CHECK-NEXT: [[COL_LOAD928:%.*]] = load <2 x double>, ptr [[VEC_GEP927]], align 8
-; CHECK-NEXT: [[BLOCK929:%.*]] = shufflevector <2 x double> [[TMP662]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK930:%.*]] = shufflevector <2 x double> [[COL_LOAD923]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP671:%.*]] = extractelement <2 x double> [[COL_LOAD926]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT931:%.*]] = insertelement <2 x double> poison, double [[TMP671]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT932:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT931]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP672:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK930]], <2 x double> [[SPLAT_SPLAT932]], <2 x double> [[BLOCK929]])
-; CHECK-NEXT: [[BLOCK933:%.*]] = shufflevector <2 x double> [[COL_LOAD925]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP673:%.*]] = extractelement <2 x double> [[COL_LOAD926]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT934:%.*]] = insertelement <2 x double> poison, double [[TMP673]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT935:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT934]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP674:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK933]], <2 x double> [[SPLAT_SPLAT935]], <2 x double> [[TMP672]])
-; CHECK-NEXT: [[TMP675:%.*]] = shufflevector <2 x double> [[TMP674]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP676:%.*]] = shufflevector <2 x double> [[TMP662]], <2 x double> [[TMP675]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK936:%.*]] = shufflevector <2 x double> [[TMP668]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK937:%.*]] = shufflevector <2 x double> [[COL_LOAD923]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP677:%.*]] = extractelement <2 x double> [[COL_LOAD928]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT938:%.*]] = insertelement <2 x double> poison, double [[TMP677]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT939:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT938]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP678:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK937]], <2 x double> [[SPLAT_SPLAT939]], <2 x double> [[BLOCK936]])
-; CHECK-NEXT: [[BLOCK940:%.*]] = shufflevector <2 x double> [[COL_LOAD925]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP679:%.*]] = extractelement <2 x double> [[COL_LOAD928]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT941:%.*]] = insertelement <2 x double> poison, double [[TMP679]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT942:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT941]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP680:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK940]], <2 x double> [[SPLAT_SPLAT942]], <2 x double> [[TMP678]])
-; CHECK-NEXT: [[TMP681:%.*]] = shufflevector <2 x double> [[TMP680]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP682:%.*]] = shufflevector <2 x double> [[TMP668]], <2 x double> [[TMP681]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP683:%.*]] = getelementptr double, ptr [[C]], i64 38
-; CHECK-NEXT: store <2 x double> [[TMP676]], ptr [[TMP683]], align 8
-; CHECK-NEXT: [[VEC_GEP943:%.*]] = getelementptr double, ptr [[TMP683]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP682]], ptr [[VEC_GEP943]], align 8
-; CHECK-NEXT: [[TMP684:%.*]] = getelementptr double, ptr [[A]], i64 0
-; CHECK-NEXT: [[COL_LOAD944:%.*]] = load <2 x double>, ptr [[TMP684]], align 8
-; CHECK-NEXT: [[VEC_GEP945:%.*]] = getelementptr double, ptr [[TMP684]], i64 8
-; CHECK-NEXT: [[COL_LOAD946:%.*]] = load <2 x double>, ptr [[VEC_GEP945]], align 8
-; CHECK-NEXT: [[TMP685:%.*]] = getelementptr double, ptr [[B]], i64 48
-; CHECK-NEXT: [[COL_LOAD947:%.*]] = load <2 x double>, ptr [[TMP685]], align 8
-; CHECK-NEXT: [[VEC_GEP948:%.*]] = getelementptr double, ptr [[TMP685]], i64 8
-; CHECK-NEXT: [[COL_LOAD949:%.*]] = load <2 x double>, ptr [[VEC_GEP948]], align 8
-; CHECK-NEXT: [[BLOCK950:%.*]] = shufflevector <2 x double> [[COL_LOAD944]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP686:%.*]] = extractelement <2 x double> [[COL_LOAD947]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT951:%.*]] = insertelement <2 x double> poison, double [[TMP686]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT952:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT951]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP687:%.*]] = fmul contract <2 x double> [[BLOCK950]], [[SPLAT_SPLAT952]]
-; CHECK-NEXT: [[BLOCK953:%.*]] = shufflevector <2 x double> [[COL_LOAD946]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP688:%.*]] = extractelement <2 x double> [[COL_LOAD947]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT954:%.*]] = insertelement <2 x double> poison, double [[TMP688]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT955:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT954]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP689:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK953]], <2 x double> [[SPLAT_SPLAT955]], <2 x double> [[TMP687]])
-; CHECK-NEXT: [[TMP690:%.*]] = shufflevector <2 x double> [[TMP689]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP691:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP690]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK956:%.*]] = shufflevector <2 x double> [[COL_LOAD944]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP692:%.*]] = extractelement <2 x double> [[COL_LOAD949]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT957:%.*]] = insertelement <2 x double> poison, double [[TMP692]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT958:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT957]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP693:%.*]] = fmul contract <2 x double> [[BLOCK956]], [[SPLAT_SPLAT958]]
-; CHECK-NEXT: [[BLOCK959:%.*]] = shufflevector <2 x double> [[COL_LOAD946]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP694:%.*]] = extractelement <2 x double> [[COL_LOAD949]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT960:%.*]] = insertelement <2 x double> poison, double [[TMP694]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT961:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT960]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP695:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK959]], <2 x double> [[SPLAT_SPLAT961]], <2 x double> [[TMP693]])
-; CHECK-NEXT: [[TMP696:%.*]] = shufflevector <2 x double> [[TMP695]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP697:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP696]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP698:%.*]] = getelementptr double, ptr [[A]], i64 16
-; CHECK-NEXT: [[COL_LOAD962:%.*]] = load <2 x double>, ptr [[TMP698]], align 8
-; CHECK-NEXT: [[VEC_GEP963:%.*]] = getelementptr double, ptr [[TMP698]], i64 8
-; CHECK-NEXT: [[COL_LOAD964:%.*]] = load <2 x double>, ptr [[VEC_GEP963]], align 8
-; CHECK-NEXT: [[TMP699:%.*]] = getelementptr double, ptr [[B]], i64 50
-; CHECK-NEXT: [[COL_LOAD965:%.*]] = load <2 x double>, ptr [[TMP699]], align 8
-; CHECK-NEXT: [[VEC_GEP966:%.*]] = getelementptr double, ptr [[TMP699]], i64 8
-; CHECK-NEXT: [[COL_LOAD967:%.*]] = load <2 x double>, ptr [[VEC_GEP966]], align 8
-; CHECK-NEXT: [[BLOCK968:%.*]] = shufflevector <2 x double> [[TMP691]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK969:%.*]] = shufflevector <2 x double> [[COL_LOAD962]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP700:%.*]] = extractelement <2 x double> [[COL_LOAD965]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT970:%.*]] = insertelement <2 x double> poison, double [[TMP700]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT971:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT970]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP701:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK969]], <2 x double> [[SPLAT_SPLAT971]], <2 x double> [[BLOCK968]])
-; CHECK-NEXT: [[BLOCK972:%.*]] = shufflevector <2 x double> [[COL_LOAD964]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP702:%.*]] = extractelement <2 x double> [[COL_LOAD965]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT973:%.*]] = insertelement <2 x double> poison, double [[TMP702]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT974:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT973]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP703:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK972]], <2 x double> [[SPLAT_SPLAT974]], <2 x double> [[TMP701]])
-; CHECK-NEXT: [[TMP704:%.*]] = shufflevector <2 x double> [[TMP703]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP705:%.*]] = shufflevector <2 x double> [[TMP691]], <2 x double> [[TMP704]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK975:%.*]] = shufflevector <2 x double> [[TMP697]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK976:%.*]] = shufflevector <2 x double> [[COL_LOAD962]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP706:%.*]] = extractelement <2 x double> [[COL_LOAD967]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT977:%.*]] = insertelement <2 x double> poison, double [[TMP706]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT978:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT977]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP707:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK976]], <2 x double> [[SPLAT_SPLAT978]], <2 x double> [[BLOCK975]])
-; CHECK-NEXT: [[BLOCK979:%.*]] = shufflevector <2 x double> [[COL_LOAD964]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP708:%.*]] = extractelement <2 x double> [[COL_LOAD967]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT980:%.*]] = insertelement <2 x double> poison, double [[TMP708]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT981:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT980]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP709:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK979]], <2 x double> [[SPLAT_SPLAT981]], <2 x double> [[TMP707]])
-; CHECK-NEXT: [[TMP710:%.*]] = shufflevector <2 x double> [[TMP709]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP711:%.*]] = shufflevector <2 x double> [[TMP697]], <2 x double> [[TMP710]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP712:%.*]] = getelementptr double, ptr [[A]], i64 32
-; CHECK-NEXT: [[COL_LOAD982:%.*]] = load <2 x double>, ptr [[TMP712]], align 8
-; CHECK-NEXT: [[VEC_GEP983:%.*]] = getelementptr double, ptr [[TMP712]], i64 8
-; CHECK-NEXT: [[COL_LOAD984:%.*]] = load <2 x double>, ptr [[VEC_GEP983]], align 8
-; CHECK-NEXT: [[TMP713:%.*]] = getelementptr double, ptr [[B]], i64 52
-; CHECK-NEXT: [[COL_LOAD985:%.*]] = load <2 x double>, ptr [[TMP713]], align 8
-; CHECK-NEXT: [[VEC_GEP986:%.*]] = getelementptr double, ptr [[TMP713]], i64 8
-; CHECK-NEXT: [[COL_LOAD987:%.*]] = load <2 x double>, ptr [[VEC_GEP986]], align 8
-; CHECK-NEXT: [[BLOCK988:%.*]] = shufflevector <2 x double> [[TMP705]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK989:%.*]] = shufflevector <2 x double> [[COL_LOAD982]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP714:%.*]] = extractelement <2 x double> [[COL_LOAD985]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT990:%.*]] = insertelement <2 x double> poison, double [[TMP714]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT991:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT990]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP715:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK989]], <2 x double> [[SPLAT_SPLAT991]], <2 x double> [[BLOCK988]])
-; CHECK-NEXT: [[BLOCK992:%.*]] = shufflevector <2 x double> [[COL_LOAD984]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP716:%.*]] = extractelement <2 x double> [[COL_LOAD985]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT993:%.*]] = insertelement <2 x double> poison, double [[TMP716]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT994:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT993]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP717:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK992]], <2 x double> [[SPLAT_SPLAT994]], <2 x double> [[TMP715]])
-; CHECK-NEXT: [[TMP718:%.*]] = shufflevector <2 x double> [[TMP717]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP719:%.*]] = shufflevector <2 x double> [[TMP705]], <2 x double> [[TMP718]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK995:%.*]] = shufflevector <2 x double> [[TMP711]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK996:%.*]] = shufflevector <2 x double> [[COL_LOAD982]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP720:%.*]] = extractelement <2 x double> [[COL_LOAD987]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT997:%.*]] = insertelement <2 x double> poison, double [[TMP720]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT998:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT997]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP721:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK996]], <2 x double> [[SPLAT_SPLAT998]], <2 x double> [[BLOCK995]])
-; CHECK-NEXT: [[BLOCK999:%.*]] = shufflevector <2 x double> [[COL_LOAD984]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP722:%.*]] = extractelement <2 x double> [[COL_LOAD987]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1000:%.*]] = insertelement <2 x double> poison, double [[TMP722]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1001:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1000]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP723:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK999]], <2 x double> [[SPLAT_SPLAT1001]], <2 x double> [[TMP721]])
-; CHECK-NEXT: [[TMP724:%.*]] = shufflevector <2 x double> [[TMP723]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP725:%.*]] = shufflevector <2 x double> [[TMP711]], <2 x double> [[TMP724]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP726:%.*]] = getelementptr double, ptr [[A]], i64 48
-; CHECK-NEXT: [[COL_LOAD1002:%.*]] = load <2 x double>, ptr [[TMP726]], align 8
-; CHECK-NEXT: [[VEC_GEP1003:%.*]] = getelementptr double, ptr [[TMP726]], i64 8
-; CHECK-NEXT: [[COL_LOAD1004:%.*]] = load <2 x double>, ptr [[VEC_GEP1003]], align 8
-; CHECK-NEXT: [[TMP727:%.*]] = getelementptr double, ptr [[B]], i64 54
-; CHECK-NEXT: [[COL_LOAD1005:%.*]] = load <2 x double>, ptr [[TMP727]], align 8
-; CHECK-NEXT: [[VEC_GEP1006:%.*]] = getelementptr double, ptr [[TMP727]], i64 8
-; CHECK-NEXT: [[COL_LOAD1007:%.*]] = load <2 x double>, ptr [[VEC_GEP1006]], align 8
-; CHECK-NEXT: [[BLOCK1008:%.*]] = shufflevector <2 x double> [[TMP719]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1009:%.*]] = shufflevector <2 x double> [[COL_LOAD1002]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP728:%.*]] = extractelement <2 x double> [[COL_LOAD1005]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1010:%.*]] = insertelement <2 x double> poison, double [[TMP728]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1011:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1010]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP729:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1009]], <2 x double> [[SPLAT_SPLAT1011]], <2 x double> [[BLOCK1008]])
-; CHECK-NEXT: [[BLOCK1012:%.*]] = shufflevector <2 x double> [[COL_LOAD1004]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP730:%.*]] = extractelement <2 x double> [[COL_LOAD1005]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1013:%.*]] = insertelement <2 x double> poison, double [[TMP730]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1014:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1013]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP731:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1012]], <2 x double> [[SPLAT_SPLAT1014]], <2 x double> [[TMP729]])
-; CHECK-NEXT: [[TMP732:%.*]] = shufflevector <2 x double> [[TMP731]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP733:%.*]] = shufflevector <2 x double> [[TMP719]], <2 x double> [[TMP732]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1015:%.*]] = shufflevector <2 x double> [[TMP725]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1016:%.*]] = shufflevector <2 x double> [[COL_LOAD1002]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP734:%.*]] = extractelement <2 x double> [[COL_LOAD1007]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1017:%.*]] = insertelement <2 x double> poison, double [[TMP734]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1018:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1017]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP735:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1016]], <2 x double> [[SPLAT_SPLAT1018]], <2 x double> [[BLOCK1015]])
-; CHECK-NEXT: [[BLOCK1019:%.*]] = shufflevector <2 x double> [[COL_LOAD1004]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP736:%.*]] = extractelement <2 x double> [[COL_LOAD1007]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1020:%.*]] = insertelement <2 x double> poison, double [[TMP736]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1021:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1020]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP737:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1019]], <2 x double> [[SPLAT_SPLAT1021]], <2 x double> [[TMP735]])
-; CHECK-NEXT: [[TMP738:%.*]] = shufflevector <2 x double> [[TMP737]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP739:%.*]] = shufflevector <2 x double> [[TMP725]], <2 x double> [[TMP738]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP740:%.*]] = getelementptr double, ptr [[C]], i64 48
-; CHECK-NEXT: store <2 x double> [[TMP733]], ptr [[TMP740]], align 8
-; CHECK-NEXT: [[VEC_GEP1022:%.*]] = getelementptr double, ptr [[TMP740]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP739]], ptr [[VEC_GEP1022]], align 8
-; CHECK-NEXT: [[TMP741:%.*]] = getelementptr double, ptr [[A]], i64 2
-; CHECK-NEXT: [[COL_LOAD1023:%.*]] = load <2 x double>, ptr [[TMP741]], align 8
-; CHECK-NEXT: [[VEC_GEP1024:%.*]] = getelementptr double, ptr [[TMP741]], i64 8
-; CHECK-NEXT: [[COL_LOAD1025:%.*]] = load <2 x double>, ptr [[VEC_GEP1024]], align 8
-; CHECK-NEXT: [[TMP742:%.*]] = getelementptr double, ptr [[B]], i64 48
-; CHECK-NEXT: [[COL_LOAD1026:%.*]] = load <2 x double>, ptr [[TMP742]], align 8
-; CHECK-NEXT: [[VEC_GEP1027:%.*]] = getelementptr double, ptr [[TMP742]], i64 8
-; CHECK-NEXT: [[COL_LOAD1028:%.*]] = load <2 x double>, ptr [[VEC_GEP1027]], align 8
-; CHECK-NEXT: [[BLOCK1029:%.*]] = shufflevector <2 x double> [[COL_LOAD1023]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP743:%.*]] = extractelement <2 x double> [[COL_LOAD1026]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1030:%.*]] = insertelement <2 x double> poison, double [[TMP743]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1031:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1030]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP744:%.*]] = fmul contract <2 x double> [[BLOCK1029]], [[SPLAT_SPLAT1031]]
-; CHECK-NEXT: [[BLOCK1032:%.*]] = shufflevector <2 x double> [[COL_LOAD1025]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP745:%.*]] = extractelement <2 x double> [[COL_LOAD1026]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1033:%.*]] = insertelement <2 x double> poison, double [[TMP745]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1034:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1033]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP746:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1032]], <2 x double> [[SPLAT_SPLAT1034]], <2 x double> [[TMP744]])
-; CHECK-NEXT: [[TMP747:%.*]] = shufflevector <2 x double> [[TMP746]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP748:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP747]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1035:%.*]] = shufflevector <2 x double> [[COL_LOAD1023]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP749:%.*]] = extractelement <2 x double> [[COL_LOAD1028]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1036:%.*]] = insertelement <2 x double> poison, double [[TMP749]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1037:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1036]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP750:%.*]] = fmul contract <2 x double> [[BLOCK1035]], [[SPLAT_SPLAT1037]]
-; CHECK-NEXT: [[BLOCK1038:%.*]] = shufflevector <2 x double> [[COL_LOAD1025]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP751:%.*]] = extractelement <2 x double> [[COL_LOAD1028]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1039:%.*]] = insertelement <2 x double> poison, double [[TMP751]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1040:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1039]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP752:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1038]], <2 x double> [[SPLAT_SPLAT1040]], <2 x double> [[TMP750]])
-; CHECK-NEXT: [[TMP753:%.*]] = shufflevector <2 x double> [[TMP752]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP754:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP753]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP755:%.*]] = getelementptr double, ptr [[A]], i64 18
-; CHECK-NEXT: [[COL_LOAD1041:%.*]] = load <2 x double>, ptr [[TMP755]], align 8
-; CHECK-NEXT: [[VEC_GEP1042:%.*]] = getelementptr double, ptr [[TMP755]], i64 8
-; CHECK-NEXT: [[COL_LOAD1043:%.*]] = load <2 x double>, ptr [[VEC_GEP1042]], align 8
-; CHECK-NEXT: [[TMP756:%.*]] = getelementptr double, ptr [[B]], i64 50
-; CHECK-NEXT: [[COL_LOAD1044:%.*]] = load <2 x double>, ptr [[TMP756]], align 8
-; CHECK-NEXT: [[VEC_GEP1045:%.*]] = getelementptr double, ptr [[TMP756]], i64 8
-; CHECK-NEXT: [[COL_LOAD1046:%.*]] = load <2 x double>, ptr [[VEC_GEP1045]], align 8
-; CHECK-NEXT: [[BLOCK1047:%.*]] = shufflevector <2 x double> [[TMP748]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1048:%.*]] = shufflevector <2 x double> [[COL_LOAD1041]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP757:%.*]] = extractelement <2 x double> [[COL_LOAD1044]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1049:%.*]] = insertelement <2 x double> poison, double [[TMP757]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1050:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1049]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP758:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1048]], <2 x double> [[SPLAT_SPLAT1050]], <2 x double> [[BLOCK1047]])
-; CHECK-NEXT: [[BLOCK1051:%.*]] = shufflevector <2 x double> [[COL_LOAD1043]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP759:%.*]] = extractelement <2 x double> [[COL_LOAD1044]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1052:%.*]] = insertelement <2 x double> poison, double [[TMP759]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1053:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1052]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP760:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1051]], <2 x double> [[SPLAT_SPLAT1053]], <2 x double> [[TMP758]])
-; CHECK-NEXT: [[TMP761:%.*]] = shufflevector <2 x double> [[TMP760]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP762:%.*]] = shufflevector <2 x double> [[TMP748]], <2 x double> [[TMP761]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1054:%.*]] = shufflevector <2 x double> [[TMP754]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1055:%.*]] = shufflevector <2 x double> [[COL_LOAD1041]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP763:%.*]] = extractelement <2 x double> [[COL_LOAD1046]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1056:%.*]] = insertelement <2 x double> poison, double [[TMP763]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1057:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1056]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP764:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1055]], <2 x double> [[SPLAT_SPLAT1057]], <2 x double> [[BLOCK1054]])
-; CHECK-NEXT: [[BLOCK1058:%.*]] = shufflevector <2 x double> [[COL_LOAD1043]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP765:%.*]] = extractelement <2 x double> [[COL_LOAD1046]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1059:%.*]] = insertelement <2 x double> poison, double [[TMP765]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1060:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1059]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP766:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1058]], <2 x double> [[SPLAT_SPLAT1060]], <2 x double> [[TMP764]])
-; CHECK-NEXT: [[TMP767:%.*]] = shufflevector <2 x double> [[TMP766]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP768:%.*]] = shufflevector <2 x double> [[TMP754]], <2 x double> [[TMP767]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP769:%.*]] = getelementptr double, ptr [[A]], i64 34
-; CHECK-NEXT: [[COL_LOAD1061:%.*]] = load <2 x double>, ptr [[TMP769]], align 8
-; CHECK-NEXT: [[VEC_GEP1062:%.*]] = getelementptr double, ptr [[TMP769]], i64 8
-; CHECK-NEXT: [[COL_LOAD1063:%.*]] = load <2 x double>, ptr [[VEC_GEP1062]], align 8
-; CHECK-NEXT: [[TMP770:%.*]] = getelementptr double, ptr [[B]], i64 52
-; CHECK-NEXT: [[COL_LOAD1064:%.*]] = load <2 x double>, ptr [[TMP770]], align 8
-; CHECK-NEXT: [[VEC_GEP1065:%.*]] = getelementptr double, ptr [[TMP770]], i64 8
-; CHECK-NEXT: [[COL_LOAD1066:%.*]] = load <2 x double>, ptr [[VEC_GEP1065]], align 8
-; CHECK-NEXT: [[BLOCK1067:%.*]] = shufflevector <2 x double> [[TMP762]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1068:%.*]] = shufflevector <2 x double> [[COL_LOAD1061]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP771:%.*]] = extractelement <2 x double> [[COL_LOAD1064]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1069:%.*]] = insertelement <2 x double> poison, double [[TMP771]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1070:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1069]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP772:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1068]], <2 x double> [[SPLAT_SPLAT1070]], <2 x double> [[BLOCK1067]])
-; CHECK-NEXT: [[BLOCK1071:%.*]] = shufflevector <2 x double> [[COL_LOAD1063]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP773:%.*]] = extractelement <2 x double> [[COL_LOAD1064]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1072:%.*]] = insertelement <2 x double> poison, double [[TMP773]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1073:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1072]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP774:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1071]], <2 x double> [[SPLAT_SPLAT1073]], <2 x double> [[TMP772]])
-; CHECK-NEXT: [[TMP775:%.*]] = shufflevector <2 x double> [[TMP774]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP776:%.*]] = shufflevector <2 x double> [[TMP762]], <2 x double> [[TMP775]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1074:%.*]] = shufflevector <2 x double> [[TMP768]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1075:%.*]] = shufflevector <2 x double> [[COL_LOAD1061]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP777:%.*]] = extractelement <2 x double> [[COL_LOAD1066]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1076:%.*]] = insertelement <2 x double> poison, double [[TMP777]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1077:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1076]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP778:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1075]], <2 x double> [[SPLAT_SPLAT1077]], <2 x double> [[BLOCK1074]])
-; CHECK-NEXT: [[BLOCK1078:%.*]] = shufflevector <2 x double> [[COL_LOAD1063]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP779:%.*]] = extractelement <2 x double> [[COL_LOAD1066]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1079:%.*]] = insertelement <2 x double> poison, double [[TMP779]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1080:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1079]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP780:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1078]], <2 x double> [[SPLAT_SPLAT1080]], <2 x double> [[TMP778]])
-; CHECK-NEXT: [[TMP781:%.*]] = shufflevector <2 x double> [[TMP780]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP782:%.*]] = shufflevector <2 x double> [[TMP768]], <2 x double> [[TMP781]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP783:%.*]] = getelementptr double, ptr [[A]], i64 50
-; CHECK-NEXT: [[COL_LOAD1081:%.*]] = load <2 x double>, ptr [[TMP783]], align 8
-; CHECK-NEXT: [[VEC_GEP1082:%.*]] = getelementptr double, ptr [[TMP783]], i64 8
-; CHECK-NEXT: [[COL_LOAD1083:%.*]] = load <2 x double>, ptr [[VEC_GEP1082]], align 8
-; CHECK-NEXT: [[TMP784:%.*]] = getelementptr double, ptr [[B]], i64 54
-; CHECK-NEXT: [[COL_LOAD1084:%.*]] = load <2 x double>, ptr [[TMP784]], align 8
-; CHECK-NEXT: [[VEC_GEP1085:%.*]] = getelementptr double, ptr [[TMP784]], i64 8
-; CHECK-NEXT: [[COL_LOAD1086:%.*]] = load <2 x double>, ptr [[VEC_GEP1085]], align 8
-; CHECK-NEXT: [[BLOCK1087:%.*]] = shufflevector <2 x double> [[TMP776]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1088:%.*]] = shufflevector <2 x double> [[COL_LOAD1081]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP785:%.*]] = extractelement <2 x double> [[COL_LOAD1084]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1089:%.*]] = insertelement <2 x double> poison, double [[TMP785]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1090:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1089]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP786:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1088]], <2 x double> [[SPLAT_SPLAT1090]], <2 x double> [[BLOCK1087]])
-; CHECK-NEXT: [[BLOCK1091:%.*]] = shufflevector <2 x double> [[COL_LOAD1083]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP787:%.*]] = extractelement <2 x double> [[COL_LOAD1084]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1092:%.*]] = insertelement <2 x double> poison, double [[TMP787]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1093:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1092]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP788:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1091]], <2 x double> [[SPLAT_SPLAT1093]], <2 x double> [[TMP786]])
-; CHECK-NEXT: [[TMP789:%.*]] = shufflevector <2 x double> [[TMP788]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP790:%.*]] = shufflevector <2 x double> [[TMP776]], <2 x double> [[TMP789]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1094:%.*]] = shufflevector <2 x double> [[TMP782]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1095:%.*]] = shufflevector <2 x double> [[COL_LOAD1081]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP791:%.*]] = extractelement <2 x double> [[COL_LOAD1086]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1096:%.*]] = insertelement <2 x double> poison, double [[TMP791]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1097:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1096]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP792:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1095]], <2 x double> [[SPLAT_SPLAT1097]], <2 x double> [[BLOCK1094]])
-; CHECK-NEXT: [[BLOCK1098:%.*]] = shufflevector <2 x double> [[COL_LOAD1083]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP793:%.*]] = extractelement <2 x double> [[COL_LOAD1086]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1099:%.*]] = insertelement <2 x double> poison, double [[TMP793]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1100:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1099]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP794:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1098]], <2 x double> [[SPLAT_SPLAT1100]], <2 x double> [[TMP792]])
-; CHECK-NEXT: [[TMP795:%.*]] = shufflevector <2 x double> [[TMP794]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP796:%.*]] = shufflevector <2 x double> [[TMP782]], <2 x double> [[TMP795]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP797:%.*]] = getelementptr double, ptr [[C]], i64 50
-; CHECK-NEXT: store <2 x double> [[TMP790]], ptr [[TMP797]], align 8
-; CHECK-NEXT: [[VEC_GEP1101:%.*]] = getelementptr double, ptr [[TMP797]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP796]], ptr [[VEC_GEP1101]], align 8
-; CHECK-NEXT: [[TMP798:%.*]] = getelementptr double, ptr [[A]], i64 4
-; CHECK-NEXT: [[COL_LOAD1102:%.*]] = load <2 x double>, ptr [[TMP798]], align 8
-; CHECK-NEXT: [[VEC_GEP1103:%.*]] = getelementptr double, ptr [[TMP798]], i64 8
-; CHECK-NEXT: [[COL_LOAD1104:%.*]] = load <2 x double>, ptr [[VEC_GEP1103]], align 8
-; CHECK-NEXT: [[TMP799:%.*]] = getelementptr double, ptr [[B]], i64 48
-; CHECK-NEXT: [[COL_LOAD1105:%.*]] = load <2 x double>, ptr [[TMP799]], align 8
-; CHECK-NEXT: [[VEC_GEP1106:%.*]] = getelementptr double, ptr [[TMP799]], i64 8
-; CHECK-NEXT: [[COL_LOAD1107:%.*]] = load <2 x double>, ptr [[VEC_GEP1106]], align 8
-; CHECK-NEXT: [[BLOCK1108:%.*]] = shufflevector <2 x double> [[COL_LOAD1102]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP800:%.*]] = extractelement <2 x double> [[COL_LOAD1105]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1109:%.*]] = insertelement <2 x double> poison, double [[TMP800]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1110:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1109]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP801:%.*]] = fmul contract <2 x double> [[BLOCK1108]], [[SPLAT_SPLAT1110]]
-; CHECK-NEXT: [[BLOCK1111:%.*]] = shufflevector <2 x double> [[COL_LOAD1104]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP802:%.*]] = extractelement <2 x double> [[COL_LOAD1105]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1112:%.*]] = insertelement <2 x double> poison, double [[TMP802]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1113:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1112]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP803:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1111]], <2 x double> [[SPLAT_SPLAT1113]], <2 x double> [[TMP801]])
-; CHECK-NEXT: [[TMP804:%.*]] = shufflevector <2 x double> [[TMP803]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP805:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP804]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1114:%.*]] = shufflevector <2 x double> [[COL_LOAD1102]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP806:%.*]] = extractelement <2 x double> [[COL_LOAD1107]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1115:%.*]] = insertelement <2 x double> poison, double [[TMP806]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1116:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1115]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP807:%.*]] = fmul contract <2 x double> [[BLOCK1114]], [[SPLAT_SPLAT1116]]
-; CHECK-NEXT: [[BLOCK1117:%.*]] = shufflevector <2 x double> [[COL_LOAD1104]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP808:%.*]] = extractelement <2 x double> [[COL_LOAD1107]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1118:%.*]] = insertelement <2 x double> poison, double [[TMP808]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1119:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1118]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP809:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1117]], <2 x double> [[SPLAT_SPLAT1119]], <2 x double> [[TMP807]])
-; CHECK-NEXT: [[TMP810:%.*]] = shufflevector <2 x double> [[TMP809]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP811:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP810]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP812:%.*]] = getelementptr double, ptr [[A]], i64 20
-; CHECK-NEXT: [[COL_LOAD1120:%.*]] = load <2 x double>, ptr [[TMP812]], align 8
-; CHECK-NEXT: [[VEC_GEP1121:%.*]] = getelementptr double, ptr [[TMP812]], i64 8
-; CHECK-NEXT: [[COL_LOAD1122:%.*]] = load <2 x double>, ptr [[VEC_GEP1121]], align 8
-; CHECK-NEXT: [[TMP813:%.*]] = getelementptr double, ptr [[B]], i64 50
-; CHECK-NEXT: [[COL_LOAD1123:%.*]] = load <2 x double>, ptr [[TMP813]], align 8
-; CHECK-NEXT: [[VEC_GEP1124:%.*]] = getelementptr double, ptr [[TMP813]], i64 8
-; CHECK-NEXT: [[COL_LOAD1125:%.*]] = load <2 x double>, ptr [[VEC_GEP1124]], align 8
-; CHECK-NEXT: [[BLOCK1126:%.*]] = shufflevector <2 x double> [[TMP805]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1127:%.*]] = shufflevector <2 x double> [[COL_LOAD1120]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP814:%.*]] = extractelement <2 x double> [[COL_LOAD1123]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1128:%.*]] = insertelement <2 x double> poison, double [[TMP814]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1129:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1128]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP815:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1127]], <2 x double> [[SPLAT_SPLAT1129]], <2 x double> [[BLOCK1126]])
-; CHECK-NEXT: [[BLOCK1130:%.*]] = shufflevector <2 x double> [[COL_LOAD1122]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP816:%.*]] = extractelement <2 x double> [[COL_LOAD1123]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1131:%.*]] = insertelement <2 x double> poison, double [[TMP816]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1132:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1131]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP817:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1130]], <2 x double> [[SPLAT_SPLAT1132]], <2 x double> [[TMP815]])
-; CHECK-NEXT: [[TMP818:%.*]] = shufflevector <2 x double> [[TMP817]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP819:%.*]] = shufflevector <2 x double> [[TMP805]], <2 x double> [[TMP818]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1133:%.*]] = shufflevector <2 x double> [[TMP811]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1134:%.*]] = shufflevector <2 x double> [[COL_LOAD1120]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP820:%.*]] = extractelement <2 x double> [[COL_LOAD1125]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1135:%.*]] = insertelement <2 x double> poison, double [[TMP820]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1136:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1135]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP821:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1134]], <2 x double> [[SPLAT_SPLAT1136]], <2 x double> [[BLOCK1133]])
-; CHECK-NEXT: [[BLOCK1137:%.*]] = shufflevector <2 x double> [[COL_LOAD1122]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP822:%.*]] = extractelement <2 x double> [[COL_LOAD1125]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1138:%.*]] = insertelement <2 x double> poison, double [[TMP822]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1139:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1138]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP823:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1137]], <2 x double> [[SPLAT_SPLAT1139]], <2 x double> [[TMP821]])
-; CHECK-NEXT: [[TMP824:%.*]] = shufflevector <2 x double> [[TMP823]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP825:%.*]] = shufflevector <2 x double> [[TMP811]], <2 x double> [[TMP824]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP826:%.*]] = getelementptr double, ptr [[A]], i64 36
-; CHECK-NEXT: [[COL_LOAD1140:%.*]] = load <2 x double>, ptr [[TMP826]], align 8
-; CHECK-NEXT: [[VEC_GEP1141:%.*]] = getelementptr double, ptr [[TMP826]], i64 8
-; CHECK-NEXT: [[COL_LOAD1142:%.*]] = load <2 x double>, ptr [[VEC_GEP1141]], align 8
-; CHECK-NEXT: [[TMP827:%.*]] = getelementptr double, ptr [[B]], i64 52
-; CHECK-NEXT: [[COL_LOAD1143:%.*]] = load <2 x double>, ptr [[TMP827]], align 8
-; CHECK-NEXT: [[VEC_GEP1144:%.*]] = getelementptr double, ptr [[TMP827]], i64 8
-; CHECK-NEXT: [[COL_LOAD1145:%.*]] = load <2 x double>, ptr [[VEC_GEP1144]], align 8
-; CHECK-NEXT: [[BLOCK1146:%.*]] = shufflevector <2 x double> [[TMP819]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1147:%.*]] = shufflevector <2 x double> [[COL_LOAD1140]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP828:%.*]] = extractelement <2 x double> [[COL_LOAD1143]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1148:%.*]] = insertelement <2 x double> poison, double [[TMP828]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1149:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1148]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP829:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1147]], <2 x double> [[SPLAT_SPLAT1149]], <2 x double> [[BLOCK1146]])
-; CHECK-NEXT: [[BLOCK1150:%.*]] = shufflevector <2 x double> [[COL_LOAD1142]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP830:%.*]] = extractelement <2 x double> [[COL_LOAD1143]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1151:%.*]] = insertelement <2 x double> poison, double [[TMP830]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1152:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1151]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP831:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1150]], <2 x double> [[SPLAT_SPLAT1152]], <2 x double> [[TMP829]])
-; CHECK-NEXT: [[TMP832:%.*]] = shufflevector <2 x double> [[TMP831]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP833:%.*]] = shufflevector <2 x double> [[TMP819]], <2 x double> [[TMP832]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1153:%.*]] = shufflevector <2 x double> [[TMP825]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1154:%.*]] = shufflevector <2 x double> [[COL_LOAD1140]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP834:%.*]] = extractelement <2 x double> [[COL_LOAD1145]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1155:%.*]] = insertelement <2 x double> poison, double [[TMP834]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1156:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1155]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP835:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1154]], <2 x double> [[SPLAT_SPLAT1156]], <2 x double> [[BLOCK1153]])
-; CHECK-NEXT: [[BLOCK1157:%.*]] = shufflevector <2 x double> [[COL_LOAD1142]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP836:%.*]] = extractelement <2 x double> [[COL_LOAD1145]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1158:%.*]] = insertelement <2 x double> poison, double [[TMP836]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1159:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1158]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP837:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1157]], <2 x double> [[SPLAT_SPLAT1159]], <2 x double> [[TMP835]])
-; CHECK-NEXT: [[TMP838:%.*]] = shufflevector <2 x double> [[TMP837]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP839:%.*]] = shufflevector <2 x double> [[TMP825]], <2 x double> [[TMP838]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP840:%.*]] = getelementptr double, ptr [[A]], i64 52
-; CHECK-NEXT: [[COL_LOAD1160:%.*]] = load <2 x double>, ptr [[TMP840]], align 8
-; CHECK-NEXT: [[VEC_GEP1161:%.*]] = getelementptr double, ptr [[TMP840]], i64 8
-; CHECK-NEXT: [[COL_LOAD1162:%.*]] = load <2 x double>, ptr [[VEC_GEP1161]], align 8
-; CHECK-NEXT: [[TMP841:%.*]] = getelementptr double, ptr [[B]], i64 54
-; CHECK-NEXT: [[COL_LOAD1163:%.*]] = load <2 x double>, ptr [[TMP841]], align 8
-; CHECK-NEXT: [[VEC_GEP1164:%.*]] = getelementptr double, ptr [[TMP841]], i64 8
-; CHECK-NEXT: [[COL_LOAD1165:%.*]] = load <2 x double>, ptr [[VEC_GEP1164]], align 8
-; CHECK-NEXT: [[BLOCK1166:%.*]] = shufflevector <2 x double> [[TMP833]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1167:%.*]] = shufflevector <2 x double> [[COL_LOAD1160]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP842:%.*]] = extractelement <2 x double> [[COL_LOAD1163]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1168:%.*]] = insertelement <2 x double> poison, double [[TMP842]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1169:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1168]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP843:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1167]], <2 x double> [[SPLAT_SPLAT1169]], <2 x double> [[BLOCK1166]])
-; CHECK-NEXT: [[BLOCK1170:%.*]] = shufflevector <2 x double> [[COL_LOAD1162]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP844:%.*]] = extractelement <2 x double> [[COL_LOAD1163]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1171:%.*]] = insertelement <2 x double> poison, double [[TMP844]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1172:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1171]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP845:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1170]], <2 x double> [[SPLAT_SPLAT1172]], <2 x double> [[TMP843]])
-; CHECK-NEXT: [[TMP846:%.*]] = shufflevector <2 x double> [[TMP845]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP847:%.*]] = shufflevector <2 x double> [[TMP833]], <2 x double> [[TMP846]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1173:%.*]] = shufflevector <2 x double> [[TMP839]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1174:%.*]] = shufflevector <2 x double> [[COL_LOAD1160]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP848:%.*]] = extractelement <2 x double> [[COL_LOAD1165]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1175:%.*]] = insertelement <2 x double> poison, double [[TMP848]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1176:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1175]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP849:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1174]], <2 x double> [[SPLAT_SPLAT1176]], <2 x double> [[BLOCK1173]])
-; CHECK-NEXT: [[BLOCK1177:%.*]] = shufflevector <2 x double> [[COL_LOAD1162]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP850:%.*]] = extractelement <2 x double> [[COL_LOAD1165]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1178:%.*]] = insertelement <2 x double> poison, double [[TMP850]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1179:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1178]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP851:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1177]], <2 x double> [[SPLAT_SPLAT1179]], <2 x double> [[TMP849]])
-; CHECK-NEXT: [[TMP852:%.*]] = shufflevector <2 x double> [[TMP851]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP853:%.*]] = shufflevector <2 x double> [[TMP839]], <2 x double> [[TMP852]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP854:%.*]] = getelementptr double, ptr [[C]], i64 52
-; CHECK-NEXT: store <2 x double> [[TMP847]], ptr [[TMP854]], align 8
-; CHECK-NEXT: [[VEC_GEP1180:%.*]] = getelementptr double, ptr [[TMP854]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP853]], ptr [[VEC_GEP1180]], align 8
-; CHECK-NEXT: [[TMP855:%.*]] = getelementptr double, ptr [[A]], i64 6
-; CHECK-NEXT: [[COL_LOAD1181:%.*]] = load <2 x double>, ptr [[TMP855]], align 8
-; CHECK-NEXT: [[VEC_GEP1182:%.*]] = getelementptr double, ptr [[TMP855]], i64 8
-; CHECK-NEXT: [[COL_LOAD1183:%.*]] = load <2 x double>, ptr [[VEC_GEP1182]], align 8
-; CHECK-NEXT: [[TMP856:%.*]] = getelementptr double, ptr [[B]], i64 48
-; CHECK-NEXT: [[COL_LOAD1184:%.*]] = load <2 x double>, ptr [[TMP856]], align 8
-; CHECK-NEXT: [[VEC_GEP1185:%.*]] = getelementptr double, ptr [[TMP856]], i64 8
-; CHECK-NEXT: [[COL_LOAD1186:%.*]] = load <2 x double>, ptr [[VEC_GEP1185]], align 8
-; CHECK-NEXT: [[BLOCK1187:%.*]] = shufflevector <2 x double> [[COL_LOAD1181]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP857:%.*]] = extractelement <2 x double> [[COL_LOAD1184]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1188:%.*]] = insertelement <2 x double> poison, double [[TMP857]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1189:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1188]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP858:%.*]] = fmul contract <2 x double> [[BLOCK1187]], [[SPLAT_SPLAT1189]]
-; CHECK-NEXT: [[BLOCK1190:%.*]] = shufflevector <2 x double> [[COL_LOAD1183]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP859:%.*]] = extractelement <2 x double> [[COL_LOAD1184]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1191:%.*]] = insertelement <2 x double> poison, double [[TMP859]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1192:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1191]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP860:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1190]], <2 x double> [[SPLAT_SPLAT1192]], <2 x double> [[TMP858]])
-; CHECK-NEXT: [[TMP861:%.*]] = shufflevector <2 x double> [[TMP860]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP862:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP861]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1193:%.*]] = shufflevector <2 x double> [[COL_LOAD1181]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP863:%.*]] = extractelement <2 x double> [[COL_LOAD1186]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1194:%.*]] = insertelement <2 x double> poison, double [[TMP863]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1195:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1194]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP864:%.*]] = fmul contract <2 x double> [[BLOCK1193]], [[SPLAT_SPLAT1195]]
-; CHECK-NEXT: [[BLOCK1196:%.*]] = shufflevector <2 x double> [[COL_LOAD1183]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP865:%.*]] = extractelement <2 x double> [[COL_LOAD1186]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1197:%.*]] = insertelement <2 x double> poison, double [[TMP865]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1198:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1197]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP866:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1196]], <2 x double> [[SPLAT_SPLAT1198]], <2 x double> [[TMP864]])
-; CHECK-NEXT: [[TMP867:%.*]] = shufflevector <2 x double> [[TMP866]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP868:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP867]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP869:%.*]] = getelementptr double, ptr [[A]], i64 22
-; CHECK-NEXT: [[COL_LOAD1199:%.*]] = load <2 x double>, ptr [[TMP869]], align 8
-; CHECK-NEXT: [[VEC_GEP1200:%.*]] = getelementptr double, ptr [[TMP869]], i64 8
-; CHECK-NEXT: [[COL_LOAD1201:%.*]] = load <2 x double>, ptr [[VEC_GEP1200]], align 8
-; CHECK-NEXT: [[TMP870:%.*]] = getelementptr double, ptr [[B]], i64 50
-; CHECK-NEXT: [[COL_LOAD1202:%.*]] = load <2 x double>, ptr [[TMP870]], align 8
-; CHECK-NEXT: [[VEC_GEP1203:%.*]] = getelementptr double, ptr [[TMP870]], i64 8
-; CHECK-NEXT: [[COL_LOAD1204:%.*]] = load <2 x double>, ptr [[VEC_GEP1203]], align 8
-; CHECK-NEXT: [[BLOCK1205:%.*]] = shufflevector <2 x double> [[TMP862]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1206:%.*]] = shufflevector <2 x double> [[COL_LOAD1199]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP871:%.*]] = extractelement <2 x double> [[COL_LOAD1202]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1207:%.*]] = insertelement <2 x double> poison, double [[TMP871]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1208:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1207]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP872:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1206]], <2 x double> [[SPLAT_SPLAT1208]], <2 x double> [[BLOCK1205]])
-; CHECK-NEXT: [[BLOCK1209:%.*]] = shufflevector <2 x double> [[COL_LOAD1201]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP873:%.*]] = extractelement <2 x double> [[COL_LOAD1202]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1210:%.*]] = insertelement <2 x double> poison, double [[TMP873]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1211:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1210]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP874:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1209]], <2 x double> [[SPLAT_SPLAT1211]], <2 x double> [[TMP872]])
-; CHECK-NEXT: [[TMP875:%.*]] = shufflevector <2 x double> [[TMP874]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP876:%.*]] = shufflevector <2 x double> [[TMP862]], <2 x double> [[TMP875]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1212:%.*]] = shufflevector <2 x double> [[TMP868]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1213:%.*]] = shufflevector <2 x double> [[COL_LOAD1199]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP877:%.*]] = extractelement <2 x double> [[COL_LOAD1204]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1214:%.*]] = insertelement <2 x double> poison, double [[TMP877]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1215:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1214]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP878:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1213]], <2 x double> [[SPLAT_SPLAT1215]], <2 x double> [[BLOCK1212]])
-; CHECK-NEXT: [[BLOCK1216:%.*]] = shufflevector <2 x double> [[COL_LOAD1201]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP879:%.*]] = extractelement <2 x double> [[COL_LOAD1204]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1217:%.*]] = insertelement <2 x double> poison, double [[TMP879]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1218:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1217]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP880:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1216]], <2 x double> [[SPLAT_SPLAT1218]], <2 x double> [[TMP878]])
-; CHECK-NEXT: [[TMP881:%.*]] = shufflevector <2 x double> [[TMP880]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP882:%.*]] = shufflevector <2 x double> [[TMP868]], <2 x double> [[TMP881]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP883:%.*]] = getelementptr double, ptr [[A]], i64 38
-; CHECK-NEXT: [[COL_LOAD1219:%.*]] = load <2 x double>, ptr [[TMP883]], align 8
-; CHECK-NEXT: [[VEC_GEP1220:%.*]] = getelementptr double, ptr [[TMP883]], i64 8
-; CHECK-NEXT: [[COL_LOAD1221:%.*]] = load <2 x double>, ptr [[VEC_GEP1220]], align 8
-; CHECK-NEXT: [[TMP884:%.*]] = getelementptr double, ptr [[B]], i64 52
-; CHECK-NEXT: [[COL_LOAD1222:%.*]] = load <2 x double>, ptr [[TMP884]], align 8
-; CHECK-NEXT: [[VEC_GEP1223:%.*]] = getelementptr double, ptr [[TMP884]], i64 8
-; CHECK-NEXT: [[COL_LOAD1224:%.*]] = load <2 x double>, ptr [[VEC_GEP1223]], align 8
-; CHECK-NEXT: [[BLOCK1225:%.*]] = shufflevector <2 x double> [[TMP876]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1226:%.*]] = shufflevector <2 x double> [[COL_LOAD1219]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP885:%.*]] = extractelement <2 x double> [[COL_LOAD1222]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1227:%.*]] = insertelement <2 x double> poison, double [[TMP885]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1228:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1227]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP886:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1226]], <2 x double> [[SPLAT_SPLAT1228]], <2 x double> [[BLOCK1225]])
-; CHECK-NEXT: [[BLOCK1229:%.*]] = shufflevector <2 x double> [[COL_LOAD1221]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP887:%.*]] = extractelement <2 x double> [[COL_LOAD1222]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1230:%.*]] = insertelement <2 x double> poison, double [[TMP887]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1231:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1230]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP888:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1229]], <2 x double> [[SPLAT_SPLAT1231]], <2 x double> [[TMP886]])
-; CHECK-NEXT: [[TMP889:%.*]] = shufflevector <2 x double> [[TMP888]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP890:%.*]] = shufflevector <2 x double> [[TMP876]], <2 x double> [[TMP889]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1232:%.*]] = shufflevector <2 x double> [[TMP882]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1233:%.*]] = shufflevector <2 x double> [[COL_LOAD1219]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP891:%.*]] = extractelement <2 x double> [[COL_LOAD1224]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1234:%.*]] = insertelement <2 x double> poison, double [[TMP891]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1235:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1234]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP892:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1233]], <2 x double> [[SPLAT_SPLAT1235]], <2 x double> [[BLOCK1232]])
-; CHECK-NEXT: [[BLOCK1236:%.*]] = shufflevector <2 x double> [[COL_LOAD1221]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP893:%.*]] = extractelement <2 x double> [[COL_LOAD1224]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1237:%.*]] = insertelement <2 x double> poison, double [[TMP893]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1238:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1237]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP894:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1236]], <2 x double> [[SPLAT_SPLAT1238]], <2 x double> [[TMP892]])
-; CHECK-NEXT: [[TMP895:%.*]] = shufflevector <2 x double> [[TMP894]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP896:%.*]] = shufflevector <2 x double> [[TMP882]], <2 x double> [[TMP895]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP897:%.*]] = getelementptr double, ptr [[A]], i64 54
-; CHECK-NEXT: [[COL_LOAD1239:%.*]] = load <2 x double>, ptr [[TMP897]], align 8
-; CHECK-NEXT: [[VEC_GEP1240:%.*]] = getelementptr double, ptr [[TMP897]], i64 8
-; CHECK-NEXT: [[COL_LOAD1241:%.*]] = load <2 x double>, ptr [[VEC_GEP1240]], align 8
-; CHECK-NEXT: [[TMP898:%.*]] = getelementptr double, ptr [[B]], i64 54
-; CHECK-NEXT: [[COL_LOAD1242:%.*]] = load <2 x double>, ptr [[TMP898]], align 8
-; CHECK-NEXT: [[VEC_GEP1243:%.*]] = getelementptr double, ptr [[TMP898]], i64 8
-; CHECK-NEXT: [[COL_LOAD1244:%.*]] = load <2 x double>, ptr [[VEC_GEP1243]], align 8
-; CHECK-NEXT: [[BLOCK1245:%.*]] = shufflevector <2 x double> [[TMP890]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1246:%.*]] = shufflevector <2 x double> [[COL_LOAD1239]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP899:%.*]] = extractelement <2 x double> [[COL_LOAD1242]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1247:%.*]] = insertelement <2 x double> poison, double [[TMP899]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1248:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1247]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP900:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1246]], <2 x double> [[SPLAT_SPLAT1248]], <2 x double> [[BLOCK1245]])
-; CHECK-NEXT: [[BLOCK1249:%.*]] = shufflevector <2 x double> [[COL_LOAD1241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP901:%.*]] = extractelement <2 x double> [[COL_LOAD1242]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1250:%.*]] = insertelement <2 x double> poison, double [[TMP901]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1251:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1250]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP902:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1249]], <2 x double> [[SPLAT_SPLAT1251]], <2 x double> [[TMP900]])
-; CHECK-NEXT: [[TMP903:%.*]] = shufflevector <2 x double> [[TMP902]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP904:%.*]] = shufflevector <2 x double> [[TMP890]], <2 x double> [[TMP903]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[BLOCK1252:%.*]] = shufflevector <2 x double> [[TMP896]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[BLOCK1253:%.*]] = shufflevector <2 x double> [[COL_LOAD1239]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP905:%.*]] = extractelement <2 x double> [[COL_LOAD1244]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1254:%.*]] = insertelement <2 x double> poison, double [[TMP905]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1255:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1254]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP906:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1253]], <2 x double> [[SPLAT_SPLAT1255]], <2 x double> [[BLOCK1252]])
-; CHECK-NEXT: [[BLOCK1256:%.*]] = shufflevector <2 x double> [[COL_LOAD1241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP907:%.*]] = extractelement <2 x double> [[COL_LOAD1244]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT1257:%.*]] = insertelement <2 x double> poison, double [[TMP907]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT1258:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1257]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP908:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1256]], <2 x double> [[SPLAT_SPLAT1258]], <2 x double> [[TMP906]])
-; CHECK-NEXT: [[TMP909:%.*]] = shufflevector <2 x double> [[TMP908]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP910:%.*]] = shufflevector <2 x double> [[TMP896]], <2 x double> [[TMP909]], <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP911:%.*]] = getelementptr double, ptr [[C]], i64 54
-; CHECK-NEXT: store <2 x double> [[TMP904]], ptr [[TMP911]], align 8
-; CHECK-NEXT: [[VEC_GEP1259:%.*]] = getelementptr double, ptr [[TMP911]], i64 8
-; CHECK-NEXT: store <2 x double> [[TMP910]], ptr [[VEC_GEP1259]], align 8
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[COLS_HEADER:.*]]
+; CHECK: [[COLS_HEADER]]:
+; CHECK-NEXT: [[COLS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[COLS_STEP:%.*]], %[[COLS_LATCH:.*]] ]
+; CHECK-NEXT: br label %[[COLS_BODY:.*]]
+; CHECK: [[COLS_BODY]]:
+; CHECK-NEXT: br label %[[ROWS_HEADER:.*]]
+; CHECK: [[ROWS_HEADER]]:
+; CHECK-NEXT: [[ROWS_IV:%.*]] = phi i64 [ 0, %[[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], %[[ROWS_LATCH:.*]] ]
+; CHECK-NEXT: br label %[[ROWS_BODY:.*]]
+; CHECK: [[ROWS_BODY]]:
+; CHECK-NEXT: br label %[[INNER_HEADER:.*]]
+; CHECK: [[INNER_HEADER]]:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ 0, %[[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], %[[INNER_LATCH:.*]] ]
+; CHECK-NEXT: [[TMP912:%.*]] = phi <2 x double> [ zeroinitializer, %[[ROWS_BODY]] ], [ [[TMP921:%.*]], %[[INNER_LATCH]] ]
+; CHECK-NEXT: [[TMP913:%.*]] = phi <2 x double> [ zeroinitializer, %[[ROWS_BODY]] ], [ [[TMP927:%.*]], %[[INNER_LATCH]] ]
+; CHECK-NEXT: br label %[[INNER_BODY:.*]]
+; CHECK: [[INNER_BODY]]:
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INNER_IV]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP914:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[COL_LOAD1240:%.*]] = load <2 x double>, ptr [[TMP914]], align 8
+; CHECK-NEXT: [[VEC_GEP1241:%.*]] = getelementptr double, ptr [[TMP914]], i64 8
+; CHECK-NEXT: [[COL_LOAD1243:%.*]] = load <2 x double>, ptr [[VEC_GEP1241]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[COLS_IV]], 8
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[INNER_IV]]
+; CHECK-NEXT: [[TMP915:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP4]]
+; CHECK-NEXT: [[COL_LOAD1245:%.*]] = load <2 x double>, ptr [[TMP915]], align 8
+; CHECK-NEXT: [[VEC_GEP1244:%.*]] = getelementptr double, ptr [[TMP915]], i64 8
+; CHECK-NEXT: [[COL_LOAD1246:%.*]] = load <2 x double>, ptr [[VEC_GEP1244]], align 8
+; CHECK-NEXT: [[BLOCK1247:%.*]] = shufflevector <2 x double> [[TMP912]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK1248:%.*]] = shufflevector <2 x double> [[COL_LOAD1240]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP916:%.*]] = extractelement <2 x double> [[COL_LOAD1245]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT1248:%.*]] = insertelement <2 x double> poison, double [[TMP916]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT1249:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1248]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP917:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1248]], <2 x double> [[SPLAT_SPLAT1249]], <2 x double> [[BLOCK1247]])
+; CHECK-NEXT: [[BLOCK1250:%.*]] = shufflevector <2 x double> [[COL_LOAD1243]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP918:%.*]] = extractelement <2 x double> [[COL_LOAD1245]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT1251:%.*]] = insertelement <2 x double> poison, double [[TMP918]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT1252:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1251]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP919:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1250]], <2 x double> [[SPLAT_SPLAT1252]], <2 x double> [[TMP917]])
+; CHECK-NEXT: [[TMP920:%.*]] = shufflevector <2 x double> [[TMP919]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP921]] = shufflevector <2 x double> [[TMP912]], <2 x double> [[TMP920]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK1254:%.*]] = shufflevector <2 x double> [[TMP913]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK1255:%.*]] = shufflevector <2 x double> [[COL_LOAD1240]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP922:%.*]] = extractelement <2 x double> [[COL_LOAD1246]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT1255:%.*]] = insertelement <2 x double> poison, double [[TMP922]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT1256:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1255]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP923:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1255]], <2 x double> [[SPLAT_SPLAT1256]], <2 x double> [[BLOCK1254]])
+; CHECK-NEXT: [[BLOCK1257:%.*]] = shufflevector <2 x double> [[COL_LOAD1243]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP924:%.*]] = extractelement <2 x double> [[COL_LOAD1246]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT1258:%.*]] = insertelement <2 x double> poison, double [[TMP924]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT1259:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT1258]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP925:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK1257]], <2 x double> [[SPLAT_SPLAT1259]], <2 x double> [[TMP923]])
+; CHECK-NEXT: [[TMP926:%.*]] = shufflevector <2 x double> [[TMP925]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP927]] = shufflevector <2 x double> [[TMP913]], <2 x double> [[TMP926]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: br label %[[INNER_LATCH]]
+; CHECK: [[INNER_LATCH]]:
+; CHECK-NEXT: [[INNER_STEP]] = add i64 [[INNER_IV]], 2
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 8
+; CHECK-NEXT: br i1 [[INNER_COND]], label %[[INNER_HEADER]], label %[[ROWS_LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[ROWS_LATCH]]:
+; CHECK-NEXT: [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
+; CHECK-NEXT: [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 8
+; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[COLS_IV]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], [[ROWS_IV]]
+; CHECK-NEXT: [[TMP928:%.*]] = getelementptr double, ptr [[C]], i64 [[TMP19]]
+; CHECK-NEXT: store <2 x double> [[TMP921]], ptr [[TMP928]], align 8
+; CHECK-NEXT: [[VEC_GEP1260:%.*]] = getelementptr double, ptr [[TMP928]], i64 8
+; CHECK-NEXT: store <2 x double> [[TMP927]], ptr [[VEC_GEP1260]], align 8
+; CHECK-NEXT: br i1 [[ROWS_COND]], label %[[ROWS_HEADER]], label %[[COLS_LATCH]]
+; CHECK: [[COLS_LATCH]]:
+; CHECK-NEXT: [[COLS_STEP]] = add i64 [[COLS_IV]], 2
+; CHECK-NEXT: [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 8
+; CHECK-NEXT: br i1 [[COLS_COND]], label %[[COLS_HEADER]], label %[[CONTINUE:.*]]
+; CHECK: [[CONTINUE]]:
; CHECK-NEXT: ret void
;
entry:
@@ -3177,3 +1044,7 @@ entry:
store <64 x double> %c, ptr %C, align 8
ret void
}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.count", i32 4}
+;.
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
index 8c2cc8e799bcd..77da175b7478b 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-loops-threshold=0 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
; REQUIRES: aarch64-registered-target
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
index fb1925d48bb96..0852940cd226e 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics -fuse-matrix-use-loops -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics -fuse-matrix-loops-threshold=0 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
; REQUIRES: aarch64-registered-target
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
index 155f7755c2095..430358f0a5138 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-loops-threshold=9999 -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
; REQUIRES: aarch64-registered-target
More information about the llvm-commits mailing list