[llvm] dd9fe97 - [Matrix] Add test showing excessive matrix codegen.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 30 10:37:28 PST 2026
Author: Florian Hahn
Date: 2026-01-30T18:35:10Z
New Revision: dd9fe97cc70b38839f8800a68188a5796d58eb08
URL: https://github.com/llvm/llvm-project/commit/dd9fe97cc70b38839f8800a68188a5796d58eb08
DIFF: https://github.com/llvm/llvm-project/commit/dd9fe97cc70b38839f8800a68188a5796d58eb08.diff
LOG: [Matrix] Add test showing excessive matrix codegen.
Add tests showing excessive codegen for a small number of matrix ops.
Added:
llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
new file mode 100644
index 0000000000000..5ec6ac8f84e52
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops-large-matrixes.ll
@@ -0,0 +1,2065 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=lower-matrix-intrinsics -matrix-allow-contract -force-fuse-matrix %s -S | FileCheck %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+define void @multiply_6x6x6(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
+; CHECK-LABEL: define void @multiply_6x6x6(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 6
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <4 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[TMP0]], i64 12
+; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <4 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: [[VEC_GEP4:%.*]] = getelementptr double, ptr [[TMP0]], i64 18
+; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <4 x double>, ptr [[VEC_GEP4]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
+; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <4 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[VEC_GEP7:%.*]] = getelementptr double, ptr [[TMP1]], i64 6
+; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <4 x double>, ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT: [[VEC_GEP9:%.*]] = getelementptr double, ptr [[TMP1]], i64 12
+; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <4 x double>, ptr [[VEC_GEP9]], align 8
+; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr double, ptr [[TMP1]], i64 18
+; CHECK-NEXT: [[COL_LOAD12:%.*]] = load <4 x double>, ptr [[VEC_GEP11]], align 8
+; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <2 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK13]], <2 x double> [[SPLAT_SPLAT15]], <2 x double> [[TMP3]])
+; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT17]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK16]], <2 x double> [[SPLAT_SPLAT18]], <2 x double> [[TMP5]])
+; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT20]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK19]], <2 x double> [[SPLAT_SPLAT21]], <2 x double> [[TMP7]])
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP10]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT24]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <2 x double> [[BLOCK23]], [[SPLAT_SPLAT25]]
+; CHECK-NEXT: [[BLOCK26:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT27:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT28:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT27]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK26]], <2 x double> [[SPLAT_SPLAT28]], <2 x double> [[TMP13]])
+; CHECK-NEXT: [[BLOCK29:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT30:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT31:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT30]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK29]], <2 x double> [[SPLAT_SPLAT31]], <2 x double> [[TMP15]])
+; CHECK-NEXT: [[BLOCK32:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT33:%.*]] = insertelement <2 x double> poison, double [[TMP18]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT34:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT33]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK32]], <2 x double> [[SPLAT_SPLAT34]], <2 x double> [[TMP17]])
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK35:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT36:%.*]] = insertelement <2 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT36]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = fmul contract <2 x double> [[BLOCK35]], [[SPLAT_SPLAT37]]
+; CHECK-NEXT: [[BLOCK38:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT39:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT39]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK38]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP23]])
+; CHECK-NEXT: [[BLOCK41:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT42:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT42]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK41]], <2 x double> [[SPLAT_SPLAT43]], <2 x double> [[TMP25]])
+; CHECK-NEXT: [[BLOCK44:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT45:%.*]] = insertelement <2 x double> poison, double [[TMP28]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT45]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK44]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP27]])
+; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP30]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK47:%.*]] = shufflevector <4 x double> [[TMP31]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <2 x double> poison, double [[TMP32]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT49]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[BLOCK48]], [[SPLAT_SPLAT50]]
+; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <2 x double> poison, double [[TMP34]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT52]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP35:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK51]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP33]])
+; CHECK-NEXT: [[BLOCK54:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT55:%.*]] = insertelement <2 x double> poison, double [[TMP36]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT55]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK54]], <2 x double> [[SPLAT_SPLAT56]], <2 x double> [[TMP35]])
+; CHECK-NEXT: [[BLOCK57:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT58:%.*]] = insertelement <2 x double> poison, double [[TMP38]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT58]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK57]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP37]])
+; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x double> [[TMP39]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x double> [[TMP31]], <4 x double> [[TMP40]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT61:%.*]] = insertelement <2 x double> poison, double [[TMP42]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT61]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP43:%.*]] = fmul contract <2 x double> [[BLOCK60]], [[SPLAT_SPLAT62]]
+; CHECK-NEXT: [[BLOCK63:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT64:%.*]] = insertelement <2 x double> poison, double [[TMP44]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT64]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK63]], <2 x double> [[SPLAT_SPLAT65]], <2 x double> [[TMP43]])
+; CHECK-NEXT: [[BLOCK66:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT67:%.*]] = insertelement <2 x double> poison, double [[TMP46]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT68:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT67]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK66]], <2 x double> [[SPLAT_SPLAT68]], <2 x double> [[TMP45]])
+; CHECK-NEXT: [[BLOCK69:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT70:%.*]] = insertelement <2 x double> poison, double [[TMP48]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT71:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT70]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP49:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK69]], <2 x double> [[SPLAT_SPLAT71]], <2 x double> [[TMP47]])
+; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <2 x double> [[TMP49]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP50]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK72:%.*]] = shufflevector <4 x double> [[TMP51]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK73:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT74:%.*]] = insertelement <2 x double> poison, double [[TMP52]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT75:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT74]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP53:%.*]] = fmul contract <2 x double> [[BLOCK73]], [[SPLAT_SPLAT75]]
+; CHECK-NEXT: [[BLOCK76:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT77:%.*]] = insertelement <2 x double> poison, double [[TMP54]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT78:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT77]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP55:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK76]], <2 x double> [[SPLAT_SPLAT78]], <2 x double> [[TMP53]])
+; CHECK-NEXT: [[BLOCK79:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT80:%.*]] = insertelement <2 x double> poison, double [[TMP56]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT81:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT80]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP57:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK79]], <2 x double> [[SPLAT_SPLAT81]], <2 x double> [[TMP55]])
+; CHECK-NEXT: [[BLOCK82:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT83:%.*]] = insertelement <2 x double> poison, double [[TMP58]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT84:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT83]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP59:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK82]], <2 x double> [[SPLAT_SPLAT84]], <2 x double> [[TMP57]])
+; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <2 x double> [[TMP59]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x double> [[TMP51]], <4 x double> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK85:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT86:%.*]] = insertelement <2 x double> poison, double [[TMP62]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT87:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT86]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP63:%.*]] = fmul contract <2 x double> [[BLOCK85]], [[SPLAT_SPLAT87]]
+; CHECK-NEXT: [[BLOCK88:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT89:%.*]] = insertelement <2 x double> poison, double [[TMP64]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT90:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT89]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP65:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK88]], <2 x double> [[SPLAT_SPLAT90]], <2 x double> [[TMP63]])
+; CHECK-NEXT: [[BLOCK91:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT92:%.*]] = insertelement <2 x double> poison, double [[TMP66]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT93:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT92]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP67:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK91]], <2 x double> [[SPLAT_SPLAT93]], <2 x double> [[TMP65]])
+; CHECK-NEXT: [[BLOCK94:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT95:%.*]] = insertelement <2 x double> poison, double [[TMP68]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT96:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT95]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP69:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK94]], <2 x double> [[SPLAT_SPLAT96]], <2 x double> [[TMP67]])
+; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <2 x double> [[TMP69]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP70]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK97:%.*]] = shufflevector <4 x double> [[TMP71]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK98:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP72:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT99:%.*]] = insertelement <2 x double> poison, double [[TMP72]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT100:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT99]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP73:%.*]] = fmul contract <2 x double> [[BLOCK98]], [[SPLAT_SPLAT100]]
+; CHECK-NEXT: [[BLOCK101:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT102:%.*]] = insertelement <2 x double> poison, double [[TMP74]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT103:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT102]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP75:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK101]], <2 x double> [[SPLAT_SPLAT103]], <2 x double> [[TMP73]])
+; CHECK-NEXT: [[BLOCK104:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP76:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT105:%.*]] = insertelement <2 x double> poison, double [[TMP76]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT106:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT105]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP77:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK104]], <2 x double> [[SPLAT_SPLAT106]], <2 x double> [[TMP75]])
+; CHECK-NEXT: [[BLOCK107:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT108:%.*]] = insertelement <2 x double> poison, double [[TMP78]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT109:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT108]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP79:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK107]], <2 x double> [[SPLAT_SPLAT109]], <2 x double> [[TMP77]])
+; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <2 x double> [[TMP79]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <4 x double> [[TMP71]], <4 x double> [[TMP80]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 24
+; CHECK-NEXT: [[COL_LOAD110:%.*]] = load <4 x double>, ptr [[TMP82]], align 8
+; CHECK-NEXT: [[VEC_GEP111:%.*]] = getelementptr double, ptr [[TMP82]], i64 6
+; CHECK-NEXT: [[COL_LOAD112:%.*]] = load <4 x double>, ptr [[VEC_GEP111]], align 8
+; CHECK-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[B]], i64 4
+; CHECK-NEXT: [[COL_LOAD113:%.*]] = load <2 x double>, ptr [[TMP83]], align 8
+; CHECK-NEXT: [[VEC_GEP114:%.*]] = getelementptr double, ptr [[TMP83]], i64 6
+; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, ptr [[VEC_GEP114]], align 8
+; CHECK-NEXT: [[VEC_GEP116:%.*]] = getelementptr double, ptr [[TMP83]], i64 12
+; CHECK-NEXT: [[COL_LOAD117:%.*]] = load <2 x double>, ptr [[VEC_GEP116]], align 8
+; CHECK-NEXT: [[VEC_GEP118:%.*]] = getelementptr double, ptr [[TMP83]], i64 18
+; CHECK-NEXT: [[COL_LOAD119:%.*]] = load <2 x double>, ptr [[VEC_GEP118]], align 8
+; CHECK-NEXT: [[BLOCK120:%.*]] = shufflevector <4 x double> [[TMP21]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK121:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP84:%.*]] = extractelement <2 x double> [[COL_LOAD113]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT122:%.*]] = insertelement <2 x double> poison, double [[TMP84]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT123:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT122]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP85:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK121]], <2 x double> [[SPLAT_SPLAT123]], <2 x double> [[BLOCK120]])
+; CHECK-NEXT: [[BLOCK124:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP86:%.*]] = extractelement <2 x double> [[COL_LOAD113]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT125:%.*]] = insertelement <2 x double> poison, double [[TMP86]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT126:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT125]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP87:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK124]], <2 x double> [[SPLAT_SPLAT126]], <2 x double> [[TMP85]])
+; CHECK-NEXT: [[TMP88:%.*]] = shufflevector <2 x double> [[TMP87]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <4 x double> [[TMP21]], <4 x double> [[TMP88]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK127:%.*]] = shufflevector <4 x double> [[TMP89]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK128:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x double> [[COL_LOAD113]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT129:%.*]] = insertelement <2 x double> poison, double [[TMP90]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT130:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT129]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP91:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK128]], <2 x double> [[SPLAT_SPLAT130]], <2 x double> [[BLOCK127]])
+; CHECK-NEXT: [[BLOCK131:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x double> [[COL_LOAD113]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT132:%.*]] = insertelement <2 x double> poison, double [[TMP92]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT133:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT132]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP93:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK131]], <2 x double> [[SPLAT_SPLAT133]], <2 x double> [[TMP91]])
+; CHECK-NEXT: [[TMP94:%.*]] = shufflevector <2 x double> [[TMP93]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP95:%.*]] = shufflevector <4 x double> [[TMP89]], <4 x double> [[TMP94]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK134:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK135:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP96:%.*]] = extractelement <2 x double> [[COL_LOAD115]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT136:%.*]] = insertelement <2 x double> poison, double [[TMP96]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT136]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP97:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK135]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[BLOCK134]])
+; CHECK-NEXT: [[BLOCK138:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x double> [[COL_LOAD115]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT139:%.*]] = insertelement <2 x double> poison, double [[TMP98]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT140:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT139]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP99:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK138]], <2 x double> [[SPLAT_SPLAT140]], <2 x double> [[TMP97]])
+; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <2 x double> [[TMP99]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> [[TMP100]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK141:%.*]] = shufflevector <4 x double> [[TMP101]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK142:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP102:%.*]] = extractelement <2 x double> [[COL_LOAD115]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT143:%.*]] = insertelement <2 x double> poison, double [[TMP102]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT144:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT143]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP103:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK142]], <2 x double> [[SPLAT_SPLAT144]], <2 x double> [[BLOCK141]])
+; CHECK-NEXT: [[BLOCK145:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP104:%.*]] = extractelement <2 x double> [[COL_LOAD115]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT146:%.*]] = insertelement <2 x double> poison, double [[TMP104]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT146]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP105:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK145]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP103]])
+; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <2 x double> [[TMP105]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x double> [[TMP101]], <4 x double> [[TMP106]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK148:%.*]] = shufflevector <4 x double> [[TMP61]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK149:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x double> [[COL_LOAD117]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT150:%.*]] = insertelement <2 x double> poison, double [[TMP108]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT151:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT150]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP109:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK149]], <2 x double> [[SPLAT_SPLAT151]], <2 x double> [[BLOCK148]])
+; CHECK-NEXT: [[BLOCK152:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP110:%.*]] = extractelement <2 x double> [[COL_LOAD117]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT153:%.*]] = insertelement <2 x double> poison, double [[TMP110]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT153]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP111:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK152]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP109]])
+; CHECK-NEXT: [[TMP112:%.*]] = shufflevector <2 x double> [[TMP111]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <4 x double> [[TMP61]], <4 x double> [[TMP112]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK155:%.*]] = shufflevector <4 x double> [[TMP113]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK156:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP114:%.*]] = extractelement <2 x double> [[COL_LOAD117]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT157:%.*]] = insertelement <2 x double> poison, double [[TMP114]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT158:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT157]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP115:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK156]], <2 x double> [[SPLAT_SPLAT158]], <2 x double> [[BLOCK155]])
+; CHECK-NEXT: [[BLOCK159:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP116:%.*]] = extractelement <2 x double> [[COL_LOAD117]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT160:%.*]] = insertelement <2 x double> poison, double [[TMP116]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT161:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT160]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP117:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK159]], <2 x double> [[SPLAT_SPLAT161]], <2 x double> [[TMP115]])
+; CHECK-NEXT: [[TMP118:%.*]] = shufflevector <2 x double> [[TMP117]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP119:%.*]] = shufflevector <4 x double> [[TMP113]], <4 x double> [[TMP118]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK162:%.*]] = shufflevector <4 x double> [[TMP81]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK163:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP120:%.*]] = extractelement <2 x double> [[COL_LOAD119]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT164:%.*]] = insertelement <2 x double> poison, double [[TMP120]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT165:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT164]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP121:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK163]], <2 x double> [[SPLAT_SPLAT165]], <2 x double> [[BLOCK162]])
+; CHECK-NEXT: [[BLOCK166:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP122:%.*]] = extractelement <2 x double> [[COL_LOAD119]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT167:%.*]] = insertelement <2 x double> poison, double [[TMP122]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT168:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT167]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP123:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK166]], <2 x double> [[SPLAT_SPLAT168]], <2 x double> [[TMP121]])
+; CHECK-NEXT: [[TMP124:%.*]] = shufflevector <2 x double> [[TMP123]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP125:%.*]] = shufflevector <4 x double> [[TMP81]], <4 x double> [[TMP124]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK169:%.*]] = shufflevector <4 x double> [[TMP125]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK170:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP126:%.*]] = extractelement <2 x double> [[COL_LOAD119]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT171:%.*]] = insertelement <2 x double> poison, double [[TMP126]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT172:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT171]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP127:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK170]], <2 x double> [[SPLAT_SPLAT172]], <2 x double> [[BLOCK169]])
+; CHECK-NEXT: [[BLOCK173:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP128:%.*]] = extractelement <2 x double> [[COL_LOAD119]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT174:%.*]] = insertelement <2 x double> poison, double [[TMP128]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT175:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT174]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP129:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK173]], <2 x double> [[SPLAT_SPLAT175]], <2 x double> [[TMP127]])
+; CHECK-NEXT: [[TMP130:%.*]] = shufflevector <2 x double> [[TMP129]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP131:%.*]] = shufflevector <4 x double> [[TMP125]], <4 x double> [[TMP130]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP132:%.*]] = getelementptr double, ptr [[C]], i64 0
+; CHECK-NEXT: store <4 x double> [[TMP95]], ptr [[TMP132]], align 8
+; CHECK-NEXT: [[VEC_GEP176:%.*]] = getelementptr double, ptr [[TMP132]], i64 6
+; CHECK-NEXT: store <4 x double> [[TMP107]], ptr [[VEC_GEP176]], align 8
+; CHECK-NEXT: [[VEC_GEP177:%.*]] = getelementptr double, ptr [[TMP132]], i64 12
+; CHECK-NEXT: store <4 x double> [[TMP119]], ptr [[VEC_GEP177]], align 8
+; CHECK-NEXT: [[VEC_GEP178:%.*]] = getelementptr double, ptr [[TMP132]], i64 18
+; CHECK-NEXT: store <4 x double> [[TMP131]], ptr [[VEC_GEP178]], align 8
+; CHECK-NEXT: [[TMP133:%.*]] = getelementptr double, ptr [[A]], i64 4
+; CHECK-NEXT: [[COL_LOAD179:%.*]] = load <2 x double>, ptr [[TMP133]], align 8
+; CHECK-NEXT: [[VEC_GEP180:%.*]] = getelementptr double, ptr [[TMP133]], i64 6
+; CHECK-NEXT: [[COL_LOAD181:%.*]] = load <2 x double>, ptr [[VEC_GEP180]], align 8
+; CHECK-NEXT: [[VEC_GEP182:%.*]] = getelementptr double, ptr [[TMP133]], i64 12
+; CHECK-NEXT: [[COL_LOAD183:%.*]] = load <2 x double>, ptr [[VEC_GEP182]], align 8
+; CHECK-NEXT: [[VEC_GEP184:%.*]] = getelementptr double, ptr [[TMP133]], i64 18
+; CHECK-NEXT: [[COL_LOAD185:%.*]] = load <2 x double>, ptr [[VEC_GEP184]], align 8
+; CHECK-NEXT: [[TMP134:%.*]] = getelementptr double, ptr [[B]], i64 0
+; CHECK-NEXT: [[COL_LOAD186:%.*]] = load <4 x double>, ptr [[TMP134]], align 8
+; CHECK-NEXT: [[VEC_GEP187:%.*]] = getelementptr double, ptr [[TMP134]], i64 6
+; CHECK-NEXT: [[COL_LOAD188:%.*]] = load <4 x double>, ptr [[VEC_GEP187]], align 8
+; CHECK-NEXT: [[VEC_GEP189:%.*]] = getelementptr double, ptr [[TMP134]], i64 12
+; CHECK-NEXT: [[COL_LOAD190:%.*]] = load <4 x double>, ptr [[VEC_GEP189]], align 8
+; CHECK-NEXT: [[VEC_GEP191:%.*]] = getelementptr double, ptr [[TMP134]], i64 18
+; CHECK-NEXT: [[COL_LOAD192:%.*]] = load <4 x double>, ptr [[VEC_GEP191]], align 8
+; CHECK-NEXT: [[BLOCK193:%.*]] = shufflevector <2 x double> [[COL_LOAD179]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP135:%.*]] = extractelement <4 x double> [[COL_LOAD186]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT194:%.*]] = insertelement <2 x double> poison, double [[TMP135]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT195:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT194]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP136:%.*]] = fmul contract <2 x double> [[BLOCK193]], [[SPLAT_SPLAT195]]
+; CHECK-NEXT: [[BLOCK196:%.*]] = shufflevector <2 x double> [[COL_LOAD181]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP137:%.*]] = extractelement <4 x double> [[COL_LOAD186]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT197:%.*]] = insertelement <2 x double> poison, double [[TMP137]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT198:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT197]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP138:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK196]], <2 x double> [[SPLAT_SPLAT198]], <2 x double> [[TMP136]])
+; CHECK-NEXT: [[BLOCK199:%.*]] = shufflevector <2 x double> [[COL_LOAD183]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x double> [[COL_LOAD186]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT200:%.*]] = insertelement <2 x double> poison, double [[TMP139]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT201:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT200]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP140:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK199]], <2 x double> [[SPLAT_SPLAT201]], <2 x double> [[TMP138]])
+; CHECK-NEXT: [[BLOCK202:%.*]] = shufflevector <2 x double> [[COL_LOAD185]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP141:%.*]] = extractelement <4 x double> [[COL_LOAD186]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT203:%.*]] = insertelement <2 x double> poison, double [[TMP141]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT204:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT203]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP142:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK202]], <2 x double> [[SPLAT_SPLAT204]], <2 x double> [[TMP140]])
+; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <2 x double> [[TMP142]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP144:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP143]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK205:%.*]] = shufflevector <2 x double> [[COL_LOAD179]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP145:%.*]] = extractelement <4 x double> [[COL_LOAD188]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT206:%.*]] = insertelement <2 x double> poison, double [[TMP145]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT207:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT206]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP146:%.*]] = fmul contract <2 x double> [[BLOCK205]], [[SPLAT_SPLAT207]]
+; CHECK-NEXT: [[BLOCK208:%.*]] = shufflevector <2 x double> [[COL_LOAD181]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP147:%.*]] = extractelement <4 x double> [[COL_LOAD188]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT209:%.*]] = insertelement <2 x double> poison, double [[TMP147]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT210:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT209]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP148:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK208]], <2 x double> [[SPLAT_SPLAT210]], <2 x double> [[TMP146]])
+; CHECK-NEXT: [[BLOCK211:%.*]] = shufflevector <2 x double> [[COL_LOAD183]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP149:%.*]] = extractelement <4 x double> [[COL_LOAD188]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT212:%.*]] = insertelement <2 x double> poison, double [[TMP149]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT213:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT212]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP150:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK211]], <2 x double> [[SPLAT_SPLAT213]], <2 x double> [[TMP148]])
+; CHECK-NEXT: [[BLOCK214:%.*]] = shufflevector <2 x double> [[COL_LOAD185]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP151:%.*]] = extractelement <4 x double> [[COL_LOAD188]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT215:%.*]] = insertelement <2 x double> poison, double [[TMP151]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT216:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT215]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP152:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK214]], <2 x double> [[SPLAT_SPLAT216]], <2 x double> [[TMP150]])
+; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x double> [[TMP152]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP153]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK217:%.*]] = shufflevector <2 x double> [[COL_LOAD179]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP155:%.*]] = extractelement <4 x double> [[COL_LOAD190]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT218:%.*]] = insertelement <2 x double> poison, double [[TMP155]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT219:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT218]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP156:%.*]] = fmul contract <2 x double> [[BLOCK217]], [[SPLAT_SPLAT219]]
+; CHECK-NEXT: [[BLOCK220:%.*]] = shufflevector <2 x double> [[COL_LOAD181]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP157:%.*]] = extractelement <4 x double> [[COL_LOAD190]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT221:%.*]] = insertelement <2 x double> poison, double [[TMP157]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT222:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT221]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP158:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK220]], <2 x double> [[SPLAT_SPLAT222]], <2 x double> [[TMP156]])
+; CHECK-NEXT: [[BLOCK223:%.*]] = shufflevector <2 x double> [[COL_LOAD183]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP159:%.*]] = extractelement <4 x double> [[COL_LOAD190]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT224:%.*]] = insertelement <2 x double> poison, double [[TMP159]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT225:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT224]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP160:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK223]], <2 x double> [[SPLAT_SPLAT225]], <2 x double> [[TMP158]])
+; CHECK-NEXT: [[BLOCK226:%.*]] = shufflevector <2 x double> [[COL_LOAD185]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP161:%.*]] = extractelement <4 x double> [[COL_LOAD190]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT227:%.*]] = insertelement <2 x double> poison, double [[TMP161]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT228:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT227]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP162:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK226]], <2 x double> [[SPLAT_SPLAT228]], <2 x double> [[TMP160]])
+; CHECK-NEXT: [[TMP163:%.*]] = shufflevector <2 x double> [[TMP162]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP164:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP163]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK229:%.*]] = shufflevector <2 x double> [[COL_LOAD179]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP165:%.*]] = extractelement <4 x double> [[COL_LOAD192]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT230:%.*]] = insertelement <2 x double> poison, double [[TMP165]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT231:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT230]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP166:%.*]] = fmul contract <2 x double> [[BLOCK229]], [[SPLAT_SPLAT231]]
+; CHECK-NEXT: [[BLOCK232:%.*]] = shufflevector <2 x double> [[COL_LOAD181]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP167:%.*]] = extractelement <4 x double> [[COL_LOAD192]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT233:%.*]] = insertelement <2 x double> poison, double [[TMP167]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT234:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT233]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP168:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK232]], <2 x double> [[SPLAT_SPLAT234]], <2 x double> [[TMP166]])
+; CHECK-NEXT: [[BLOCK235:%.*]] = shufflevector <2 x double> [[COL_LOAD183]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP169:%.*]] = extractelement <4 x double> [[COL_LOAD192]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT236:%.*]] = insertelement <2 x double> poison, double [[TMP169]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT237:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT236]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP170:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK235]], <2 x double> [[SPLAT_SPLAT237]], <2 x double> [[TMP168]])
+; CHECK-NEXT: [[BLOCK238:%.*]] = shufflevector <2 x double> [[COL_LOAD185]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP171:%.*]] = extractelement <4 x double> [[COL_LOAD192]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT239:%.*]] = insertelement <2 x double> poison, double [[TMP171]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT240:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT239]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP172:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK238]], <2 x double> [[SPLAT_SPLAT240]], <2 x double> [[TMP170]])
+; CHECK-NEXT: [[TMP173:%.*]] = shufflevector <2 x double> [[TMP172]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP174:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP173]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP175:%.*]] = getelementptr double, ptr [[A]], i64 28
+; CHECK-NEXT: [[COL_LOAD241:%.*]] = load <2 x double>, ptr [[TMP175]], align 8
+; CHECK-NEXT: [[VEC_GEP242:%.*]] = getelementptr double, ptr [[TMP175]], i64 6
+; CHECK-NEXT: [[COL_LOAD243:%.*]] = load <2 x double>, ptr [[VEC_GEP242]], align 8
+; CHECK-NEXT: [[TMP176:%.*]] = getelementptr double, ptr [[B]], i64 4
+; CHECK-NEXT: [[COL_LOAD244:%.*]] = load <2 x double>, ptr [[TMP176]], align 8
+; CHECK-NEXT: [[VEC_GEP245:%.*]] = getelementptr double, ptr [[TMP176]], i64 6
+; CHECK-NEXT: [[COL_LOAD246:%.*]] = load <2 x double>, ptr [[VEC_GEP245]], align 8
+; CHECK-NEXT: [[VEC_GEP247:%.*]] = getelementptr double, ptr [[TMP176]], i64 12
+; CHECK-NEXT: [[COL_LOAD248:%.*]] = load <2 x double>, ptr [[VEC_GEP247]], align 8
+; CHECK-NEXT: [[VEC_GEP249:%.*]] = getelementptr double, ptr [[TMP176]], i64 18
+; CHECK-NEXT: [[COL_LOAD250:%.*]] = load <2 x double>, ptr [[VEC_GEP249]], align 8
+; CHECK-NEXT: [[BLOCK251:%.*]] = shufflevector <2 x double> [[TMP144]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK252:%.*]] = shufflevector <2 x double> [[COL_LOAD241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x double> [[COL_LOAD244]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT253:%.*]] = insertelement <2 x double> poison, double [[TMP177]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT254:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT253]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP178:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK252]], <2 x double> [[SPLAT_SPLAT254]], <2 x double> [[BLOCK251]])
+; CHECK-NEXT: [[BLOCK255:%.*]] = shufflevector <2 x double> [[COL_LOAD243]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x double> [[COL_LOAD244]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT256:%.*]] = insertelement <2 x double> poison, double [[TMP179]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT257:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT256]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP180:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK255]], <2 x double> [[SPLAT_SPLAT257]], <2 x double> [[TMP178]])
+; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x double> [[TMP180]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP182:%.*]] = shufflevector <2 x double> [[TMP144]], <2 x double> [[TMP181]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK258:%.*]] = shufflevector <2 x double> [[TMP154]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK259:%.*]] = shufflevector <2 x double> [[COL_LOAD241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x double> [[COL_LOAD246]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT260:%.*]] = insertelement <2 x double> poison, double [[TMP183]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT261:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT260]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP184:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK259]], <2 x double> [[SPLAT_SPLAT261]], <2 x double> [[BLOCK258]])
+; CHECK-NEXT: [[BLOCK262:%.*]] = shufflevector <2 x double> [[COL_LOAD243]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP185:%.*]] = extractelement <2 x double> [[COL_LOAD246]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT263:%.*]] = insertelement <2 x double> poison, double [[TMP185]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT264:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT263]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP186:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK262]], <2 x double> [[SPLAT_SPLAT264]], <2 x double> [[TMP184]])
+; CHECK-NEXT: [[TMP187:%.*]] = shufflevector <2 x double> [[TMP186]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP188:%.*]] = shufflevector <2 x double> [[TMP154]], <2 x double> [[TMP187]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK265:%.*]] = shufflevector <2 x double> [[TMP164]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK266:%.*]] = shufflevector <2 x double> [[COL_LOAD241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x double> [[COL_LOAD248]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT267:%.*]] = insertelement <2 x double> poison, double [[TMP189]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT268:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT267]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP190:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK266]], <2 x double> [[SPLAT_SPLAT268]], <2 x double> [[BLOCK265]])
+; CHECK-NEXT: [[BLOCK269:%.*]] = shufflevector <2 x double> [[COL_LOAD243]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x double> [[COL_LOAD248]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT270:%.*]] = insertelement <2 x double> poison, double [[TMP191]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT271:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT270]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP192:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK269]], <2 x double> [[SPLAT_SPLAT271]], <2 x double> [[TMP190]])
+; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x double> [[TMP192]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x double> [[TMP164]], <2 x double> [[TMP193]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK272:%.*]] = shufflevector <2 x double> [[TMP174]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK273:%.*]] = shufflevector <2 x double> [[COL_LOAD241]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP195:%.*]] = extractelement <2 x double> [[COL_LOAD250]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT274:%.*]] = insertelement <2 x double> poison, double [[TMP195]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT275:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT274]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP196:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK273]], <2 x double> [[SPLAT_SPLAT275]], <2 x double> [[BLOCK272]])
+; CHECK-NEXT: [[BLOCK276:%.*]] = shufflevector <2 x double> [[COL_LOAD243]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP197:%.*]] = extractelement <2 x double> [[COL_LOAD250]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT277:%.*]] = insertelement <2 x double> poison, double [[TMP197]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT278:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT277]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP198:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK276]], <2 x double> [[SPLAT_SPLAT278]], <2 x double> [[TMP196]])
+; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x double> [[TMP198]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP200:%.*]] = shufflevector <2 x double> [[TMP174]], <2 x double> [[TMP199]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP201:%.*]] = getelementptr double, ptr [[C]], i64 4
+; CHECK-NEXT: store <2 x double> [[TMP182]], ptr [[TMP201]], align 8
+; CHECK-NEXT: [[VEC_GEP279:%.*]] = getelementptr double, ptr [[TMP201]], i64 6
+; CHECK-NEXT: store <2 x double> [[TMP188]], ptr [[VEC_GEP279]], align 8
+; CHECK-NEXT: [[VEC_GEP280:%.*]] = getelementptr double, ptr [[TMP201]], i64 12
+; CHECK-NEXT: store <2 x double> [[TMP194]], ptr [[VEC_GEP280]], align 8
+; CHECK-NEXT: [[VEC_GEP281:%.*]] = getelementptr double, ptr [[TMP201]], i64 18
+; CHECK-NEXT: store <2 x double> [[TMP200]], ptr [[VEC_GEP281]], align 8
+; CHECK-NEXT: [[TMP202:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT: [[COL_LOAD282:%.*]] = load <4 x double>, ptr [[TMP202]], align 8
+; CHECK-NEXT: [[VEC_GEP283:%.*]] = getelementptr double, ptr [[TMP202]], i64 6
+; CHECK-NEXT: [[COL_LOAD284:%.*]] = load <4 x double>, ptr [[VEC_GEP283]], align 8
+; CHECK-NEXT: [[VEC_GEP285:%.*]] = getelementptr double, ptr [[TMP202]], i64 12
+; CHECK-NEXT: [[COL_LOAD286:%.*]] = load <4 x double>, ptr [[VEC_GEP285]], align 8
+; CHECK-NEXT: [[VEC_GEP287:%.*]] = getelementptr double, ptr [[TMP202]], i64 18
+; CHECK-NEXT: [[COL_LOAD288:%.*]] = load <4 x double>, ptr [[VEC_GEP287]], align 8
+; CHECK-NEXT: [[TMP203:%.*]] = getelementptr double, ptr [[B]], i64 24
+; CHECK-NEXT: [[COL_LOAD289:%.*]] = load <4 x double>, ptr [[TMP203]], align 8
+; CHECK-NEXT: [[VEC_GEP290:%.*]] = getelementptr double, ptr [[TMP203]], i64 6
+; CHECK-NEXT: [[COL_LOAD291:%.*]] = load <4 x double>, ptr [[VEC_GEP290]], align 8
+; CHECK-NEXT: [[BLOCK292:%.*]] = shufflevector <4 x double> [[COL_LOAD282]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP204:%.*]] = extractelement <4 x double> [[COL_LOAD289]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT293:%.*]] = insertelement <2 x double> poison, double [[TMP204]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT294:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT293]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP205:%.*]] = fmul contract <2 x double> [[BLOCK292]], [[SPLAT_SPLAT294]]
+; CHECK-NEXT: [[BLOCK295:%.*]] = shufflevector <4 x double> [[COL_LOAD284]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP206:%.*]] = extractelement <4 x double> [[COL_LOAD289]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT296:%.*]] = insertelement <2 x double> poison, double [[TMP206]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT297:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT296]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP207:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK295]], <2 x double> [[SPLAT_SPLAT297]], <2 x double> [[TMP205]])
+; CHECK-NEXT: [[BLOCK298:%.*]] = shufflevector <4 x double> [[COL_LOAD286]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP208:%.*]] = extractelement <4 x double> [[COL_LOAD289]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT299:%.*]] = insertelement <2 x double> poison, double [[TMP208]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT300:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT299]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP209:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK298]], <2 x double> [[SPLAT_SPLAT300]], <2 x double> [[TMP207]])
+; CHECK-NEXT: [[BLOCK301:%.*]] = shufflevector <4 x double> [[COL_LOAD288]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP210:%.*]] = extractelement <4 x double> [[COL_LOAD289]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT302:%.*]] = insertelement <2 x double> poison, double [[TMP210]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT303:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT302]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP211:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK301]], <2 x double> [[SPLAT_SPLAT303]], <2 x double> [[TMP209]])
+; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x double> [[TMP211]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP213:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP212]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK304:%.*]] = shufflevector <4 x double> [[TMP213]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK305:%.*]] = shufflevector <4 x double> [[COL_LOAD282]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP214:%.*]] = extractelement <4 x double> [[COL_LOAD289]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT306:%.*]] = insertelement <2 x double> poison, double [[TMP214]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT307:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT306]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP215:%.*]] = fmul contract <2 x double> [[BLOCK305]], [[SPLAT_SPLAT307]]
+; CHECK-NEXT: [[BLOCK308:%.*]] = shufflevector <4 x double> [[COL_LOAD284]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP216:%.*]] = extractelement <4 x double> [[COL_LOAD289]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT309:%.*]] = insertelement <2 x double> poison, double [[TMP216]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT310:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT309]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP217:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK308]], <2 x double> [[SPLAT_SPLAT310]], <2 x double> [[TMP215]])
+; CHECK-NEXT: [[BLOCK311:%.*]] = shufflevector <4 x double> [[COL_LOAD286]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP218:%.*]] = extractelement <4 x double> [[COL_LOAD289]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT312:%.*]] = insertelement <2 x double> poison, double [[TMP218]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT313:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT312]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP219:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK311]], <2 x double> [[SPLAT_SPLAT313]], <2 x double> [[TMP217]])
+; CHECK-NEXT: [[BLOCK314:%.*]] = shufflevector <4 x double> [[COL_LOAD288]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP220:%.*]] = extractelement <4 x double> [[COL_LOAD289]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT315:%.*]] = insertelement <2 x double> poison, double [[TMP220]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT316:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT315]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP221:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK314]], <2 x double> [[SPLAT_SPLAT316]], <2 x double> [[TMP219]])
+; CHECK-NEXT: [[TMP222:%.*]] = shufflevector <2 x double> [[TMP221]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP223:%.*]] = shufflevector <4 x double> [[TMP213]], <4 x double> [[TMP222]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK317:%.*]] = shufflevector <4 x double> [[COL_LOAD282]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP224:%.*]] = extractelement <4 x double> [[COL_LOAD291]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT318:%.*]] = insertelement <2 x double> poison, double [[TMP224]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT319:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT318]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP225:%.*]] = fmul contract <2 x double> [[BLOCK317]], [[SPLAT_SPLAT319]]
+; CHECK-NEXT: [[BLOCK320:%.*]] = shufflevector <4 x double> [[COL_LOAD284]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP226:%.*]] = extractelement <4 x double> [[COL_LOAD291]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT321:%.*]] = insertelement <2 x double> poison, double [[TMP226]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT322:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT321]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP227:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK320]], <2 x double> [[SPLAT_SPLAT322]], <2 x double> [[TMP225]])
+; CHECK-NEXT: [[BLOCK323:%.*]] = shufflevector <4 x double> [[COL_LOAD286]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP228:%.*]] = extractelement <4 x double> [[COL_LOAD291]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT324:%.*]] = insertelement <2 x double> poison, double [[TMP228]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT325:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT324]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP229:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK323]], <2 x double> [[SPLAT_SPLAT325]], <2 x double> [[TMP227]])
+; CHECK-NEXT: [[BLOCK326:%.*]] = shufflevector <4 x double> [[COL_LOAD288]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP230:%.*]] = extractelement <4 x double> [[COL_LOAD291]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT327:%.*]] = insertelement <2 x double> poison, double [[TMP230]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT328:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT327]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP231:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK326]], <2 x double> [[SPLAT_SPLAT328]], <2 x double> [[TMP229]])
+; CHECK-NEXT: [[TMP232:%.*]] = shufflevector <2 x double> [[TMP231]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP233:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP232]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK329:%.*]] = shufflevector <4 x double> [[TMP233]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK330:%.*]] = shufflevector <4 x double> [[COL_LOAD282]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP234:%.*]] = extractelement <4 x double> [[COL_LOAD291]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT331:%.*]] = insertelement <2 x double> poison, double [[TMP234]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT332:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT331]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP235:%.*]] = fmul contract <2 x double> [[BLOCK330]], [[SPLAT_SPLAT332]]
+; CHECK-NEXT: [[BLOCK333:%.*]] = shufflevector <4 x double> [[COL_LOAD284]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP236:%.*]] = extractelement <4 x double> [[COL_LOAD291]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT334:%.*]] = insertelement <2 x double> poison, double [[TMP236]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT335:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT334]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP237:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK333]], <2 x double> [[SPLAT_SPLAT335]], <2 x double> [[TMP235]])
+; CHECK-NEXT: [[BLOCK336:%.*]] = shufflevector <4 x double> [[COL_LOAD286]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP238:%.*]] = extractelement <4 x double> [[COL_LOAD291]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT337:%.*]] = insertelement <2 x double> poison, double [[TMP238]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT338:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT337]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP239:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK336]], <2 x double> [[SPLAT_SPLAT338]], <2 x double> [[TMP237]])
+; CHECK-NEXT: [[BLOCK339:%.*]] = shufflevector <4 x double> [[COL_LOAD288]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP240:%.*]] = extractelement <4 x double> [[COL_LOAD291]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT340:%.*]] = insertelement <2 x double> poison, double [[TMP240]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT341:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT340]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP241:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK339]], <2 x double> [[SPLAT_SPLAT341]], <2 x double> [[TMP239]])
+; CHECK-NEXT: [[TMP242:%.*]] = shufflevector <2 x double> [[TMP241]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP243:%.*]] = shufflevector <4 x double> [[TMP233]], <4 x double> [[TMP242]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP244:%.*]] = getelementptr double, ptr [[A]], i64 24
+; CHECK-NEXT: [[COL_LOAD342:%.*]] = load <4 x double>, ptr [[TMP244]], align 8
+; CHECK-NEXT: [[VEC_GEP343:%.*]] = getelementptr double, ptr [[TMP244]], i64 6
+; CHECK-NEXT: [[COL_LOAD344:%.*]] = load <4 x double>, ptr [[VEC_GEP343]], align 8
+; CHECK-NEXT: [[TMP245:%.*]] = getelementptr double, ptr [[B]], i64 28
+; CHECK-NEXT: [[COL_LOAD345:%.*]] = load <2 x double>, ptr [[TMP245]], align 8
+; CHECK-NEXT: [[VEC_GEP346:%.*]] = getelementptr double, ptr [[TMP245]], i64 6
+; CHECK-NEXT: [[COL_LOAD347:%.*]] = load <2 x double>, ptr [[VEC_GEP346]], align 8
+; CHECK-NEXT: [[BLOCK348:%.*]] = shufflevector <4 x double> [[TMP223]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK349:%.*]] = shufflevector <4 x double> [[COL_LOAD342]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP246:%.*]] = extractelement <2 x double> [[COL_LOAD345]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT350:%.*]] = insertelement <2 x double> poison, double [[TMP246]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT351:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT350]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP247:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK349]], <2 x double> [[SPLAT_SPLAT351]], <2 x double> [[BLOCK348]])
+; CHECK-NEXT: [[BLOCK352:%.*]] = shufflevector <4 x double> [[COL_LOAD344]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP248:%.*]] = extractelement <2 x double> [[COL_LOAD345]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT353:%.*]] = insertelement <2 x double> poison, double [[TMP248]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT354:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT353]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP249:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK352]], <2 x double> [[SPLAT_SPLAT354]], <2 x double> [[TMP247]])
+; CHECK-NEXT: [[TMP250:%.*]] = shufflevector <2 x double> [[TMP249]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP251:%.*]] = shufflevector <4 x double> [[TMP223]], <4 x double> [[TMP250]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK355:%.*]] = shufflevector <4 x double> [[TMP251]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK356:%.*]] = shufflevector <4 x double> [[COL_LOAD342]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP252:%.*]] = extractelement <2 x double> [[COL_LOAD345]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT357:%.*]] = insertelement <2 x double> poison, double [[TMP252]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT358:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT357]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP253:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK356]], <2 x double> [[SPLAT_SPLAT358]], <2 x double> [[BLOCK355]])
+; CHECK-NEXT: [[BLOCK359:%.*]] = shufflevector <4 x double> [[COL_LOAD344]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP254:%.*]] = extractelement <2 x double> [[COL_LOAD345]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT360:%.*]] = insertelement <2 x double> poison, double [[TMP254]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT361:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT360]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP255:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK359]], <2 x double> [[SPLAT_SPLAT361]], <2 x double> [[TMP253]])
+; CHECK-NEXT: [[TMP256:%.*]] = shufflevector <2 x double> [[TMP255]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP257:%.*]] = shufflevector <4 x double> [[TMP251]], <4 x double> [[TMP256]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK362:%.*]] = shufflevector <4 x double> [[TMP243]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK363:%.*]] = shufflevector <4 x double> [[COL_LOAD342]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP258:%.*]] = extractelement <2 x double> [[COL_LOAD347]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT364:%.*]] = insertelement <2 x double> poison, double [[TMP258]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT365:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT364]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP259:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK363]], <2 x double> [[SPLAT_SPLAT365]], <2 x double> [[BLOCK362]])
+; CHECK-NEXT: [[BLOCK366:%.*]] = shufflevector <4 x double> [[COL_LOAD344]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP260:%.*]] = extractelement <2 x double> [[COL_LOAD347]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT367:%.*]] = insertelement <2 x double> poison, double [[TMP260]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT368:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT367]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP261:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK366]], <2 x double> [[SPLAT_SPLAT368]], <2 x double> [[TMP259]])
+; CHECK-NEXT: [[TMP262:%.*]] = shufflevector <2 x double> [[TMP261]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP263:%.*]] = shufflevector <4 x double> [[TMP243]], <4 x double> [[TMP262]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK369:%.*]] = shufflevector <4 x double> [[TMP263]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK370:%.*]] = shufflevector <4 x double> [[COL_LOAD342]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP264:%.*]] = extractelement <2 x double> [[COL_LOAD347]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT371:%.*]] = insertelement <2 x double> poison, double [[TMP264]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT372:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT371]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP265:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK370]], <2 x double> [[SPLAT_SPLAT372]], <2 x double> [[BLOCK369]])
+; CHECK-NEXT: [[BLOCK373:%.*]] = shufflevector <4 x double> [[COL_LOAD344]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP266:%.*]] = extractelement <2 x double> [[COL_LOAD347]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT374:%.*]] = insertelement <2 x double> poison, double [[TMP266]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT375:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT374]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP267:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK373]], <2 x double> [[SPLAT_SPLAT375]], <2 x double> [[TMP265]])
+; CHECK-NEXT: [[TMP268:%.*]] = shufflevector <2 x double> [[TMP267]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP269:%.*]] = shufflevector <4 x double> [[TMP263]], <4 x double> [[TMP268]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP270:%.*]] = getelementptr double, ptr [[C]], i64 24
+; CHECK-NEXT: store <4 x double> [[TMP257]], ptr [[TMP270]], align 8
+; CHECK-NEXT: [[VEC_GEP376:%.*]] = getelementptr double, ptr [[TMP270]], i64 6
+; CHECK-NEXT: store <4 x double> [[TMP269]], ptr [[VEC_GEP376]], align 8
+; CHECK-NEXT: [[TMP271:%.*]] = getelementptr double, ptr [[A]], i64 4
+; CHECK-NEXT: [[COL_LOAD377:%.*]] = load <2 x double>, ptr [[TMP271]], align 8
+; CHECK-NEXT: [[VEC_GEP378:%.*]] = getelementptr double, ptr [[TMP271]], i64 6
+; CHECK-NEXT: [[COL_LOAD379:%.*]] = load <2 x double>, ptr [[VEC_GEP378]], align 8
+; CHECK-NEXT: [[VEC_GEP380:%.*]] = getelementptr double, ptr [[TMP271]], i64 12
+; CHECK-NEXT: [[COL_LOAD381:%.*]] = load <2 x double>, ptr [[VEC_GEP380]], align 8
+; CHECK-NEXT: [[VEC_GEP382:%.*]] = getelementptr double, ptr [[TMP271]], i64 18
+; CHECK-NEXT: [[COL_LOAD383:%.*]] = load <2 x double>, ptr [[VEC_GEP382]], align 8
+; CHECK-NEXT: [[TMP272:%.*]] = getelementptr double, ptr [[B]], i64 24
+; CHECK-NEXT: [[COL_LOAD384:%.*]] = load <4 x double>, ptr [[TMP272]], align 8
+; CHECK-NEXT: [[VEC_GEP385:%.*]] = getelementptr double, ptr [[TMP272]], i64 6
+; CHECK-NEXT: [[COL_LOAD386:%.*]] = load <4 x double>, ptr [[VEC_GEP385]], align 8
+; CHECK-NEXT: [[BLOCK387:%.*]] = shufflevector <2 x double> [[COL_LOAD377]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP273:%.*]] = extractelement <4 x double> [[COL_LOAD384]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT388:%.*]] = insertelement <2 x double> poison, double [[TMP273]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT389:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT388]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP274:%.*]] = fmul contract <2 x double> [[BLOCK387]], [[SPLAT_SPLAT389]]
+; CHECK-NEXT: [[BLOCK390:%.*]] = shufflevector <2 x double> [[COL_LOAD379]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP275:%.*]] = extractelement <4 x double> [[COL_LOAD384]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT391:%.*]] = insertelement <2 x double> poison, double [[TMP275]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT392:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT391]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP276:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK390]], <2 x double> [[SPLAT_SPLAT392]], <2 x double> [[TMP274]])
+; CHECK-NEXT: [[BLOCK393:%.*]] = shufflevector <2 x double> [[COL_LOAD381]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP277:%.*]] = extractelement <4 x double> [[COL_LOAD384]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT394:%.*]] = insertelement <2 x double> poison, double [[TMP277]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT395:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT394]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP278:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK393]], <2 x double> [[SPLAT_SPLAT395]], <2 x double> [[TMP276]])
+; CHECK-NEXT: [[BLOCK396:%.*]] = shufflevector <2 x double> [[COL_LOAD383]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP279:%.*]] = extractelement <4 x double> [[COL_LOAD384]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT397:%.*]] = insertelement <2 x double> poison, double [[TMP279]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT398:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT397]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP280:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK396]], <2 x double> [[SPLAT_SPLAT398]], <2 x double> [[TMP278]])
+; CHECK-NEXT: [[TMP281:%.*]] = shufflevector <2 x double> [[TMP280]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP282:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP281]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK399:%.*]] = shufflevector <2 x double> [[COL_LOAD377]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP283:%.*]] = extractelement <4 x double> [[COL_LOAD386]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT400:%.*]] = insertelement <2 x double> poison, double [[TMP283]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT401:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT400]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP284:%.*]] = fmul contract <2 x double> [[BLOCK399]], [[SPLAT_SPLAT401]]
+; CHECK-NEXT: [[BLOCK402:%.*]] = shufflevector <2 x double> [[COL_LOAD379]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP285:%.*]] = extractelement <4 x double> [[COL_LOAD386]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT403:%.*]] = insertelement <2 x double> poison, double [[TMP285]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT404:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT403]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP286:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK402]], <2 x double> [[SPLAT_SPLAT404]], <2 x double> [[TMP284]])
+; CHECK-NEXT: [[BLOCK405:%.*]] = shufflevector <2 x double> [[COL_LOAD381]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP287:%.*]] = extractelement <4 x double> [[COL_LOAD386]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT406:%.*]] = insertelement <2 x double> poison, double [[TMP287]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT407:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT406]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP288:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK405]], <2 x double> [[SPLAT_SPLAT407]], <2 x double> [[TMP286]])
+; CHECK-NEXT: [[BLOCK408:%.*]] = shufflevector <2 x double> [[COL_LOAD383]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP289:%.*]] = extractelement <4 x double> [[COL_LOAD386]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT409:%.*]] = insertelement <2 x double> poison, double [[TMP289]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT410:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT409]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP290:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK408]], <2 x double> [[SPLAT_SPLAT410]], <2 x double> [[TMP288]])
+; CHECK-NEXT: [[TMP291:%.*]] = shufflevector <2 x double> [[TMP290]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP292:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP291]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP293:%.*]] = getelementptr double, ptr [[A]], i64 28
+; CHECK-NEXT: [[COL_LOAD411:%.*]] = load <2 x double>, ptr [[TMP293]], align 8
+; CHECK-NEXT: [[VEC_GEP412:%.*]] = getelementptr double, ptr [[TMP293]], i64 6
+; CHECK-NEXT: [[COL_LOAD413:%.*]] = load <2 x double>, ptr [[VEC_GEP412]], align 8
+; CHECK-NEXT: [[TMP294:%.*]] = getelementptr double, ptr [[B]], i64 28
+; CHECK-NEXT: [[COL_LOAD414:%.*]] = load <2 x double>, ptr [[TMP294]], align 8
+; CHECK-NEXT: [[VEC_GEP415:%.*]] = getelementptr double, ptr [[TMP294]], i64 6
+; CHECK-NEXT: [[COL_LOAD416:%.*]] = load <2 x double>, ptr [[VEC_GEP415]], align 8
+; CHECK-NEXT: [[BLOCK417:%.*]] = shufflevector <2 x double> [[TMP282]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK418:%.*]] = shufflevector <2 x double> [[COL_LOAD411]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP295:%.*]] = extractelement <2 x double> [[COL_LOAD414]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT419:%.*]] = insertelement <2 x double> poison, double [[TMP295]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT420:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT419]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP296:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK418]], <2 x double> [[SPLAT_SPLAT420]], <2 x double> [[BLOCK417]])
+; CHECK-NEXT: [[BLOCK421:%.*]] = shufflevector <2 x double> [[COL_LOAD413]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP297:%.*]] = extractelement <2 x double> [[COL_LOAD414]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT422:%.*]] = insertelement <2 x double> poison, double [[TMP297]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT423:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT422]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP298:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK421]], <2 x double> [[SPLAT_SPLAT423]], <2 x double> [[TMP296]])
+; CHECK-NEXT: [[TMP299:%.*]] = shufflevector <2 x double> [[TMP298]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP300:%.*]] = shufflevector <2 x double> [[TMP282]], <2 x double> [[TMP299]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK424:%.*]] = shufflevector <2 x double> [[TMP292]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK425:%.*]] = shufflevector <2 x double> [[COL_LOAD411]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP301:%.*]] = extractelement <2 x double> [[COL_LOAD416]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT426:%.*]] = insertelement <2 x double> poison, double [[TMP301]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT427:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT426]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP302:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK425]], <2 x double> [[SPLAT_SPLAT427]], <2 x double> [[BLOCK424]])
+; CHECK-NEXT: [[BLOCK428:%.*]] = shufflevector <2 x double> [[COL_LOAD413]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP303:%.*]] = extractelement <2 x double> [[COL_LOAD416]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT429:%.*]] = insertelement <2 x double> poison, double [[TMP303]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT430:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT429]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP304:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK428]], <2 x double> [[SPLAT_SPLAT430]], <2 x double> [[TMP302]])
+; CHECK-NEXT: [[TMP305:%.*]] = shufflevector <2 x double> [[TMP304]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP306:%.*]] = shufflevector <2 x double> [[TMP292]], <2 x double> [[TMP305]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP307:%.*]] = getelementptr double, ptr [[C]], i64 28
+; CHECK-NEXT: store <2 x double> [[TMP300]], ptr [[TMP307]], align 8
+; CHECK-NEXT: [[VEC_GEP431:%.*]] = getelementptr double, ptr [[TMP307]], i64 6
+; CHECK-NEXT: store <2 x double> [[TMP306]], ptr [[VEC_GEP431]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %a = load <36 x double>, ptr %A, align 8
+ %b = load <36 x double>, ptr %B, align 8
+ %c = call <36 x double> @llvm.matrix.multiply.v36f64.v36f64.v36f64(<36 x double> %a, <36 x double> %b, i32 6, i32 6, i32 6)
+ store <36 x double> %c, ptr %C, align 8
+ ret void
+}
+
+define void @multiply_7x7x7(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
+; CHECK-LABEL: define void @multiply_7x7x7(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 7
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <4 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[TMP0]], i64 14
+; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <4 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: [[VEC_GEP4:%.*]] = getelementptr double, ptr [[TMP0]], i64 21
+; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <4 x double>, ptr [[VEC_GEP4]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
+; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <4 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[VEC_GEP7:%.*]] = getelementptr double, ptr [[TMP1]], i64 7
+; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <4 x double>, ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT: [[VEC_GEP9:%.*]] = getelementptr double, ptr [[TMP1]], i64 14
+; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <4 x double>, ptr [[VEC_GEP9]], align 8
+; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr double, ptr [[TMP1]], i64 21
+; CHECK-NEXT: [[COL_LOAD12:%.*]] = load <4 x double>, ptr [[VEC_GEP11]], align 8
+; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <2 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK13]], <2 x double> [[SPLAT_SPLAT15]], <2 x double> [[TMP3]])
+; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT17]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK16]], <2 x double> [[SPLAT_SPLAT18]], <2 x double> [[TMP5]])
+; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT20]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK19]], <2 x double> [[SPLAT_SPLAT21]], <2 x double> [[TMP7]])
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP10]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT24]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <2 x double> [[BLOCK23]], [[SPLAT_SPLAT25]]
+; CHECK-NEXT: [[BLOCK26:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT27:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT28:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT27]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK26]], <2 x double> [[SPLAT_SPLAT28]], <2 x double> [[TMP13]])
+; CHECK-NEXT: [[BLOCK29:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT30:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT31:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT30]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK29]], <2 x double> [[SPLAT_SPLAT31]], <2 x double> [[TMP15]])
+; CHECK-NEXT: [[BLOCK32:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x double> [[COL_LOAD6]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT33:%.*]] = insertelement <2 x double> poison, double [[TMP18]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT34:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT33]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK32]], <2 x double> [[SPLAT_SPLAT34]], <2 x double> [[TMP17]])
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK35:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT36:%.*]] = insertelement <2 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT36]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = fmul contract <2 x double> [[BLOCK35]], [[SPLAT_SPLAT37]]
+; CHECK-NEXT: [[BLOCK38:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT39:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT39]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK38]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP23]])
+; CHECK-NEXT: [[BLOCK41:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT42:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT42]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK41]], <2 x double> [[SPLAT_SPLAT43]], <2 x double> [[TMP25]])
+; CHECK-NEXT: [[BLOCK44:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT45:%.*]] = insertelement <2 x double> poison, double [[TMP28]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT45]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK44]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP27]])
+; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP30]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK47:%.*]] = shufflevector <4 x double> [[TMP31]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <2 x double> poison, double [[TMP32]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT49]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[BLOCK48]], [[SPLAT_SPLAT50]]
+; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <2 x double> poison, double [[TMP34]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT52]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP35:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK51]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP33]])
+; CHECK-NEXT: [[BLOCK54:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT55:%.*]] = insertelement <2 x double> poison, double [[TMP36]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT55]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK54]], <2 x double> [[SPLAT_SPLAT56]], <2 x double> [[TMP35]])
+; CHECK-NEXT: [[BLOCK57:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[COL_LOAD8]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT58:%.*]] = insertelement <2 x double> poison, double [[TMP38]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT58]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK57]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP37]])
+; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x double> [[TMP39]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x double> [[TMP31]], <4 x double> [[TMP40]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT61:%.*]] = insertelement <2 x double> poison, double [[TMP42]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT61]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP43:%.*]] = fmul contract <2 x double> [[BLOCK60]], [[SPLAT_SPLAT62]]
+; CHECK-NEXT: [[BLOCK63:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT64:%.*]] = insertelement <2 x double> poison, double [[TMP44]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT64]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK63]], <2 x double> [[SPLAT_SPLAT65]], <2 x double> [[TMP43]])
+; CHECK-NEXT: [[BLOCK66:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT67:%.*]] = insertelement <2 x double> poison, double [[TMP46]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT68:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT67]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK66]], <2 x double> [[SPLAT_SPLAT68]], <2 x double> [[TMP45]])
+; CHECK-NEXT: [[BLOCK69:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT70:%.*]] = insertelement <2 x double> poison, double [[TMP48]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT71:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT70]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP49:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK69]], <2 x double> [[SPLAT_SPLAT71]], <2 x double> [[TMP47]])
+; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <2 x double> [[TMP49]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP50]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK72:%.*]] = shufflevector <4 x double> [[TMP51]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK73:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT74:%.*]] = insertelement <2 x double> poison, double [[TMP52]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT75:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT74]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP53:%.*]] = fmul contract <2 x double> [[BLOCK73]], [[SPLAT_SPLAT75]]
+; CHECK-NEXT: [[BLOCK76:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT77:%.*]] = insertelement <2 x double> poison, double [[TMP54]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT78:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT77]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP55:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK76]], <2 x double> [[SPLAT_SPLAT78]], <2 x double> [[TMP53]])
+; CHECK-NEXT: [[BLOCK79:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT80:%.*]] = insertelement <2 x double> poison, double [[TMP56]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT81:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT80]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP57:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK79]], <2 x double> [[SPLAT_SPLAT81]], <2 x double> [[TMP55]])
+; CHECK-NEXT: [[BLOCK82:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[COL_LOAD10]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT83:%.*]] = insertelement <2 x double> poison, double [[TMP58]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT84:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT83]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP59:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK82]], <2 x double> [[SPLAT_SPLAT84]], <2 x double> [[TMP57]])
+; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <2 x double> [[TMP59]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x double> [[TMP51]], <4 x double> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK85:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT86:%.*]] = insertelement <2 x double> poison, double [[TMP62]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT87:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT86]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP63:%.*]] = fmul contract <2 x double> [[BLOCK85]], [[SPLAT_SPLAT87]]
+; CHECK-NEXT: [[BLOCK88:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT89:%.*]] = insertelement <2 x double> poison, double [[TMP64]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT90:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT89]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP65:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK88]], <2 x double> [[SPLAT_SPLAT90]], <2 x double> [[TMP63]])
+; CHECK-NEXT: [[BLOCK91:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT92:%.*]] = insertelement <2 x double> poison, double [[TMP66]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT93:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT92]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP67:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK91]], <2 x double> [[SPLAT_SPLAT93]], <2 x double> [[TMP65]])
+; CHECK-NEXT: [[BLOCK94:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT95:%.*]] = insertelement <2 x double> poison, double [[TMP68]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT96:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT95]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP69:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK94]], <2 x double> [[SPLAT_SPLAT96]], <2 x double> [[TMP67]])
+; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <2 x double> [[TMP69]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP70]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK97:%.*]] = shufflevector <4 x double> [[TMP71]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK98:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP72:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT99:%.*]] = insertelement <2 x double> poison, double [[TMP72]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT100:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT99]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP73:%.*]] = fmul contract <2 x double> [[BLOCK98]], [[SPLAT_SPLAT100]]
+; CHECK-NEXT: [[BLOCK101:%.*]] = shufflevector <4 x double> [[COL_LOAD1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT102:%.*]] = insertelement <2 x double> poison, double [[TMP74]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT103:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT102]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP75:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK101]], <2 x double> [[SPLAT_SPLAT103]], <2 x double> [[TMP73]])
+; CHECK-NEXT: [[BLOCK104:%.*]] = shufflevector <4 x double> [[COL_LOAD3]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP76:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT105:%.*]] = insertelement <2 x double> poison, double [[TMP76]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT106:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT105]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP77:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK104]], <2 x double> [[SPLAT_SPLAT106]], <2 x double> [[TMP75]])
+; CHECK-NEXT: [[BLOCK107:%.*]] = shufflevector <4 x double> [[COL_LOAD5]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x double> [[COL_LOAD12]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT108:%.*]] = insertelement <2 x double> poison, double [[TMP78]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT109:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT108]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP79:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK107]], <2 x double> [[SPLAT_SPLAT109]], <2 x double> [[TMP77]])
+; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <2 x double> [[TMP79]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <4 x double> [[TMP71]], <4 x double> [[TMP80]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 28
+; CHECK-NEXT: [[COL_LOAD110:%.*]] = load <4 x double>, ptr [[TMP82]], align 8
+; CHECK-NEXT: [[VEC_GEP111:%.*]] = getelementptr double, ptr [[TMP82]], i64 7
+; CHECK-NEXT: [[COL_LOAD112:%.*]] = load <4 x double>, ptr [[VEC_GEP111]], align 8
+; CHECK-NEXT: [[VEC_GEP113:%.*]] = getelementptr double, ptr [[TMP82]], i64 14
+; CHECK-NEXT: [[COL_LOAD114:%.*]] = load <4 x double>, ptr [[VEC_GEP113]], align 8
+; CHECK-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[B]], i64 4
+; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <3 x double>, ptr [[TMP83]], align 8
+; CHECK-NEXT: [[VEC_GEP116:%.*]] = getelementptr double, ptr [[TMP83]], i64 7
+; CHECK-NEXT: [[COL_LOAD117:%.*]] = load <3 x double>, ptr [[VEC_GEP116]], align 8
+; CHECK-NEXT: [[VEC_GEP118:%.*]] = getelementptr double, ptr [[TMP83]], i64 14
+; CHECK-NEXT: [[COL_LOAD119:%.*]] = load <3 x double>, ptr [[VEC_GEP118]], align 8
+; CHECK-NEXT: [[VEC_GEP120:%.*]] = getelementptr double, ptr [[TMP83]], i64 21
+; CHECK-NEXT: [[COL_LOAD121:%.*]] = load <3 x double>, ptr [[VEC_GEP120]], align 8
+; CHECK-NEXT: [[BLOCK122:%.*]] = shufflevector <4 x double> [[TMP21]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK123:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP84:%.*]] = extractelement <3 x double> [[COL_LOAD115]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT124:%.*]] = insertelement <2 x double> poison, double [[TMP84]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT125:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT124]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP85:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK123]], <2 x double> [[SPLAT_SPLAT125]], <2 x double> [[BLOCK122]])
+; CHECK-NEXT: [[BLOCK126:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP86:%.*]] = extractelement <3 x double> [[COL_LOAD115]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT127:%.*]] = insertelement <2 x double> poison, double [[TMP86]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT127]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP87:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK126]], <2 x double> [[SPLAT_SPLAT128]], <2 x double> [[TMP85]])
+; CHECK-NEXT: [[BLOCK129:%.*]] = shufflevector <4 x double> [[COL_LOAD114]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[COL_LOAD115]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT130:%.*]] = insertelement <2 x double> poison, double [[TMP88]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT130]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP89:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK129]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP87]])
+; CHECK-NEXT: [[TMP90:%.*]] = shufflevector <2 x double> [[TMP89]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP91:%.*]] = shufflevector <4 x double> [[TMP21]], <4 x double> [[TMP90]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK132:%.*]] = shufflevector <4 x double> [[TMP91]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK133:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP92:%.*]] = extractelement <3 x double> [[COL_LOAD115]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT134:%.*]] = insertelement <2 x double> poison, double [[TMP92]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT135:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT134]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP93:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK133]], <2 x double> [[SPLAT_SPLAT135]], <2 x double> [[BLOCK132]])
+; CHECK-NEXT: [[BLOCK136:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP94:%.*]] = extractelement <3 x double> [[COL_LOAD115]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT137:%.*]] = insertelement <2 x double> poison, double [[TMP94]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT138:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT137]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP95:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK136]], <2 x double> [[SPLAT_SPLAT138]], <2 x double> [[TMP93]])
+; CHECK-NEXT: [[BLOCK139:%.*]] = shufflevector <4 x double> [[COL_LOAD114]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP96:%.*]] = extractelement <3 x double> [[COL_LOAD115]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT140:%.*]] = insertelement <2 x double> poison, double [[TMP96]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT141:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT140]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP97:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK139]], <2 x double> [[SPLAT_SPLAT141]], <2 x double> [[TMP95]])
+; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <2 x double> [[TMP97]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x double> [[TMP91]], <4 x double> [[TMP98]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK142:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK143:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD117]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT144:%.*]] = insertelement <2 x double> poison, double [[TMP100]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT145:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT144]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP101:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK143]], <2 x double> [[SPLAT_SPLAT145]], <2 x double> [[BLOCK142]])
+; CHECK-NEXT: [[BLOCK146:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP102:%.*]] = extractelement <3 x double> [[COL_LOAD117]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT147:%.*]] = insertelement <2 x double> poison, double [[TMP102]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT148:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT147]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP103:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK146]], <2 x double> [[SPLAT_SPLAT148]], <2 x double> [[TMP101]])
+; CHECK-NEXT: [[BLOCK149:%.*]] = shufflevector <4 x double> [[COL_LOAD114]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP104:%.*]] = extractelement <3 x double> [[COL_LOAD117]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT150:%.*]] = insertelement <2 x double> poison, double [[TMP104]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT151:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT150]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP105:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK149]], <2 x double> [[SPLAT_SPLAT151]], <2 x double> [[TMP103]])
+; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <2 x double> [[TMP105]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> [[TMP106]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK152:%.*]] = shufflevector <4 x double> [[TMP107]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK153:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP108:%.*]] = extractelement <3 x double> [[COL_LOAD117]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT154:%.*]] = insertelement <2 x double> poison, double [[TMP108]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT155:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT154]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP109:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK153]], <2 x double> [[SPLAT_SPLAT155]], <2 x double> [[BLOCK152]])
+; CHECK-NEXT: [[BLOCK156:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP110:%.*]] = extractelement <3 x double> [[COL_LOAD117]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT157:%.*]] = insertelement <2 x double> poison, double [[TMP110]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT158:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT157]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP111:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK156]], <2 x double> [[SPLAT_SPLAT158]], <2 x double> [[TMP109]])
+; CHECK-NEXT: [[BLOCK159:%.*]] = shufflevector <4 x double> [[COL_LOAD114]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP112:%.*]] = extractelement <3 x double> [[COL_LOAD117]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT160:%.*]] = insertelement <2 x double> poison, double [[TMP112]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT161:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT160]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP113:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK159]], <2 x double> [[SPLAT_SPLAT161]], <2 x double> [[TMP111]])
+; CHECK-NEXT: [[TMP114:%.*]] = shufflevector <2 x double> [[TMP113]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP115:%.*]] = shufflevector <4 x double> [[TMP107]], <4 x double> [[TMP114]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK162:%.*]] = shufflevector <4 x double> [[TMP61]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK163:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP116:%.*]] = extractelement <3 x double> [[COL_LOAD119]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT164:%.*]] = insertelement <2 x double> poison, double [[TMP116]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT165:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT164]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP117:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK163]], <2 x double> [[SPLAT_SPLAT165]], <2 x double> [[BLOCK162]])
+; CHECK-NEXT: [[BLOCK166:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP118:%.*]] = extractelement <3 x double> [[COL_LOAD119]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT167:%.*]] = insertelement <2 x double> poison, double [[TMP118]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT168:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT167]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP119:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK166]], <2 x double> [[SPLAT_SPLAT168]], <2 x double> [[TMP117]])
+; CHECK-NEXT: [[BLOCK169:%.*]] = shufflevector <4 x double> [[COL_LOAD114]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP120:%.*]] = extractelement <3 x double> [[COL_LOAD119]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT170:%.*]] = insertelement <2 x double> poison, double [[TMP120]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT171:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT170]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP121:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK169]], <2 x double> [[SPLAT_SPLAT171]], <2 x double> [[TMP119]])
+; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <2 x double> [[TMP121]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP123:%.*]] = shufflevector <4 x double> [[TMP61]], <4 x double> [[TMP122]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK172:%.*]] = shufflevector <4 x double> [[TMP123]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK173:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP124:%.*]] = extractelement <3 x double> [[COL_LOAD119]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT174:%.*]] = insertelement <2 x double> poison, double [[TMP124]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT175:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT174]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP125:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK173]], <2 x double> [[SPLAT_SPLAT175]], <2 x double> [[BLOCK172]])
+; CHECK-NEXT: [[BLOCK176:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP126:%.*]] = extractelement <3 x double> [[COL_LOAD119]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT177:%.*]] = insertelement <2 x double> poison, double [[TMP126]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT178:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT177]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP127:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK176]], <2 x double> [[SPLAT_SPLAT178]], <2 x double> [[TMP125]])
+; CHECK-NEXT: [[BLOCK179:%.*]] = shufflevector <4 x double> [[COL_LOAD114]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP128:%.*]] = extractelement <3 x double> [[COL_LOAD119]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT180:%.*]] = insertelement <2 x double> poison, double [[TMP128]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT181:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT180]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP129:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK179]], <2 x double> [[SPLAT_SPLAT181]], <2 x double> [[TMP127]])
+; CHECK-NEXT: [[TMP130:%.*]] = shufflevector <2 x double> [[TMP129]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP131:%.*]] = shufflevector <4 x double> [[TMP123]], <4 x double> [[TMP130]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK182:%.*]] = shufflevector <4 x double> [[TMP81]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK183:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP132:%.*]] = extractelement <3 x double> [[COL_LOAD121]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT184:%.*]] = insertelement <2 x double> poison, double [[TMP132]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT185:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT184]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP133:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK183]], <2 x double> [[SPLAT_SPLAT185]], <2 x double> [[BLOCK182]])
+; CHECK-NEXT: [[BLOCK186:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP134:%.*]] = extractelement <3 x double> [[COL_LOAD121]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT187:%.*]] = insertelement <2 x double> poison, double [[TMP134]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT188:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT187]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP135:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK186]], <2 x double> [[SPLAT_SPLAT188]], <2 x double> [[TMP133]])
+; CHECK-NEXT: [[BLOCK189:%.*]] = shufflevector <4 x double> [[COL_LOAD114]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP136:%.*]] = extractelement <3 x double> [[COL_LOAD121]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT190:%.*]] = insertelement <2 x double> poison, double [[TMP136]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT191:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT190]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP137:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK189]], <2 x double> [[SPLAT_SPLAT191]], <2 x double> [[TMP135]])
+; CHECK-NEXT: [[TMP138:%.*]] = shufflevector <2 x double> [[TMP137]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP139:%.*]] = shufflevector <4 x double> [[TMP81]], <4 x double> [[TMP138]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK192:%.*]] = shufflevector <4 x double> [[TMP139]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK193:%.*]] = shufflevector <4 x double> [[COL_LOAD110]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP140:%.*]] = extractelement <3 x double> [[COL_LOAD121]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT194:%.*]] = insertelement <2 x double> poison, double [[TMP140]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT195:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT194]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP141:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK193]], <2 x double> [[SPLAT_SPLAT195]], <2 x double> [[BLOCK192]])
+; CHECK-NEXT: [[BLOCK196:%.*]] = shufflevector <4 x double> [[COL_LOAD112]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP142:%.*]] = extractelement <3 x double> [[COL_LOAD121]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT197:%.*]] = insertelement <2 x double> poison, double [[TMP142]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT198:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT197]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP143:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK196]], <2 x double> [[SPLAT_SPLAT198]], <2 x double> [[TMP141]])
+; CHECK-NEXT: [[BLOCK199:%.*]] = shufflevector <4 x double> [[COL_LOAD114]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP144:%.*]] = extractelement <3 x double> [[COL_LOAD121]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT200:%.*]] = insertelement <2 x double> poison, double [[TMP144]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT201:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT200]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP145:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK199]], <2 x double> [[SPLAT_SPLAT201]], <2 x double> [[TMP143]])
+; CHECK-NEXT: [[TMP146:%.*]] = shufflevector <2 x double> [[TMP145]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP147:%.*]] = shufflevector <4 x double> [[TMP139]], <4 x double> [[TMP146]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP148:%.*]] = getelementptr double, ptr [[C]], i64 0
+; CHECK-NEXT: store <4 x double> [[TMP99]], ptr [[TMP148]], align 8
+; CHECK-NEXT: [[VEC_GEP202:%.*]] = getelementptr double, ptr [[TMP148]], i64 7
+; CHECK-NEXT: store <4 x double> [[TMP115]], ptr [[VEC_GEP202]], align 8
+; CHECK-NEXT: [[VEC_GEP203:%.*]] = getelementptr double, ptr [[TMP148]], i64 14
+; CHECK-NEXT: store <4 x double> [[TMP131]], ptr [[VEC_GEP203]], align 8
+; CHECK-NEXT: [[VEC_GEP204:%.*]] = getelementptr double, ptr [[TMP148]], i64 21
+; CHECK-NEXT: store <4 x double> [[TMP147]], ptr [[VEC_GEP204]], align 8
+; CHECK-NEXT: [[TMP149:%.*]] = getelementptr double, ptr [[A]], i64 4
+; CHECK-NEXT: [[COL_LOAD205:%.*]] = load <3 x double>, ptr [[TMP149]], align 8
+; CHECK-NEXT: [[VEC_GEP206:%.*]] = getelementptr double, ptr [[TMP149]], i64 7
+; CHECK-NEXT: [[COL_LOAD207:%.*]] = load <3 x double>, ptr [[VEC_GEP206]], align 8
+; CHECK-NEXT: [[VEC_GEP208:%.*]] = getelementptr double, ptr [[TMP149]], i64 14
+; CHECK-NEXT: [[COL_LOAD209:%.*]] = load <3 x double>, ptr [[VEC_GEP208]], align 8
+; CHECK-NEXT: [[VEC_GEP210:%.*]] = getelementptr double, ptr [[TMP149]], i64 21
+; CHECK-NEXT: [[COL_LOAD211:%.*]] = load <3 x double>, ptr [[VEC_GEP210]], align 8
+; CHECK-NEXT: [[TMP150:%.*]] = getelementptr double, ptr [[B]], i64 0
+; CHECK-NEXT: [[COL_LOAD212:%.*]] = load <4 x double>, ptr [[TMP150]], align 8
+; CHECK-NEXT: [[VEC_GEP213:%.*]] = getelementptr double, ptr [[TMP150]], i64 7
+; CHECK-NEXT: [[COL_LOAD214:%.*]] = load <4 x double>, ptr [[VEC_GEP213]], align 8
+; CHECK-NEXT: [[VEC_GEP215:%.*]] = getelementptr double, ptr [[TMP150]], i64 14
+; CHECK-NEXT: [[COL_LOAD216:%.*]] = load <4 x double>, ptr [[VEC_GEP215]], align 8
+; CHECK-NEXT: [[VEC_GEP217:%.*]] = getelementptr double, ptr [[TMP150]], i64 21
+; CHECK-NEXT: [[COL_LOAD218:%.*]] = load <4 x double>, ptr [[VEC_GEP217]], align 8
+; CHECK-NEXT: [[BLOCK219:%.*]] = shufflevector <3 x double> [[COL_LOAD205]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP151:%.*]] = extractelement <4 x double> [[COL_LOAD212]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT220:%.*]] = insertelement <2 x double> poison, double [[TMP151]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT221:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT220]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP152:%.*]] = fmul contract <2 x double> [[BLOCK219]], [[SPLAT_SPLAT221]]
+; CHECK-NEXT: [[BLOCK222:%.*]] = shufflevector <3 x double> [[COL_LOAD207]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP153:%.*]] = extractelement <4 x double> [[COL_LOAD212]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT223:%.*]] = insertelement <2 x double> poison, double [[TMP153]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT224:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT223]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP154:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK222]], <2 x double> [[SPLAT_SPLAT224]], <2 x double> [[TMP152]])
+; CHECK-NEXT: [[BLOCK225:%.*]] = shufflevector <3 x double> [[COL_LOAD209]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP155:%.*]] = extractelement <4 x double> [[COL_LOAD212]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT226:%.*]] = insertelement <2 x double> poison, double [[TMP155]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT227:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT226]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP156:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK225]], <2 x double> [[SPLAT_SPLAT227]], <2 x double> [[TMP154]])
+; CHECK-NEXT: [[BLOCK228:%.*]] = shufflevector <3 x double> [[COL_LOAD211]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP157:%.*]] = extractelement <4 x double> [[COL_LOAD212]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT229:%.*]] = insertelement <2 x double> poison, double [[TMP157]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT230:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT229]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP158:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK228]], <2 x double> [[SPLAT_SPLAT230]], <2 x double> [[TMP156]])
+; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x double> [[TMP158]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP160:%.*]] = shufflevector <3 x double> zeroinitializer, <3 x double> [[TMP159]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK231:%.*]] = shufflevector <3 x double> [[TMP160]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK232:%.*]] = shufflevector <3 x double> [[COL_LOAD205]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP161:%.*]] = extractelement <4 x double> [[COL_LOAD212]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT233:%.*]] = insertelement <1 x double> poison, double [[TMP161]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT234:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT233]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP162:%.*]] = fmul contract <1 x double> [[BLOCK232]], [[SPLAT_SPLAT234]]
+; CHECK-NEXT: [[BLOCK235:%.*]] = shufflevector <3 x double> [[COL_LOAD207]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP163:%.*]] = extractelement <4 x double> [[COL_LOAD212]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT236:%.*]] = insertelement <1 x double> poison, double [[TMP163]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT237:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT236]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP164:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK235]], <1 x double> [[SPLAT_SPLAT237]], <1 x double> [[TMP162]])
+; CHECK-NEXT: [[BLOCK238:%.*]] = shufflevector <3 x double> [[COL_LOAD209]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP165:%.*]] = extractelement <4 x double> [[COL_LOAD212]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT239:%.*]] = insertelement <1 x double> poison, double [[TMP165]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT240:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT239]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP166:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK238]], <1 x double> [[SPLAT_SPLAT240]], <1 x double> [[TMP164]])
+; CHECK-NEXT: [[BLOCK241:%.*]] = shufflevector <3 x double> [[COL_LOAD211]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP167:%.*]] = extractelement <4 x double> [[COL_LOAD212]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT242:%.*]] = insertelement <1 x double> poison, double [[TMP167]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT243:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT242]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP168:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK241]], <1 x double> [[SPLAT_SPLAT243]], <1 x double> [[TMP166]])
+; CHECK-NEXT: [[TMP169:%.*]] = shufflevector <1 x double> [[TMP168]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP170:%.*]] = shufflevector <3 x double> [[TMP160]], <3 x double> [[TMP169]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK244:%.*]] = shufflevector <3 x double> [[COL_LOAD205]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP171:%.*]] = extractelement <4 x double> [[COL_LOAD214]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT245:%.*]] = insertelement <2 x double> poison, double [[TMP171]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT246:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT245]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP172:%.*]] = fmul contract <2 x double> [[BLOCK244]], [[SPLAT_SPLAT246]]
+; CHECK-NEXT: [[BLOCK247:%.*]] = shufflevector <3 x double> [[COL_LOAD207]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP173:%.*]] = extractelement <4 x double> [[COL_LOAD214]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT248:%.*]] = insertelement <2 x double> poison, double [[TMP173]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT249:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT248]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP174:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK247]], <2 x double> [[SPLAT_SPLAT249]], <2 x double> [[TMP172]])
+; CHECK-NEXT: [[BLOCK250:%.*]] = shufflevector <3 x double> [[COL_LOAD209]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP175:%.*]] = extractelement <4 x double> [[COL_LOAD214]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT251:%.*]] = insertelement <2 x double> poison, double [[TMP175]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT252:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT251]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP176:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK250]], <2 x double> [[SPLAT_SPLAT252]], <2 x double> [[TMP174]])
+; CHECK-NEXT: [[BLOCK253:%.*]] = shufflevector <3 x double> [[COL_LOAD211]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP177:%.*]] = extractelement <4 x double> [[COL_LOAD214]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT254:%.*]] = insertelement <2 x double> poison, double [[TMP177]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT255:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT254]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP178:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK253]], <2 x double> [[SPLAT_SPLAT255]], <2 x double> [[TMP176]])
+; CHECK-NEXT: [[TMP179:%.*]] = shufflevector <2 x double> [[TMP178]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <3 x double> zeroinitializer, <3 x double> [[TMP179]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK256:%.*]] = shufflevector <3 x double> [[TMP180]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK257:%.*]] = shufflevector <3 x double> [[COL_LOAD205]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP181:%.*]] = extractelement <4 x double> [[COL_LOAD214]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT258:%.*]] = insertelement <1 x double> poison, double [[TMP181]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT259:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT258]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP182:%.*]] = fmul contract <1 x double> [[BLOCK257]], [[SPLAT_SPLAT259]]
+; CHECK-NEXT: [[BLOCK260:%.*]] = shufflevector <3 x double> [[COL_LOAD207]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP183:%.*]] = extractelement <4 x double> [[COL_LOAD214]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT261:%.*]] = insertelement <1 x double> poison, double [[TMP183]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT262:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT261]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP184:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK260]], <1 x double> [[SPLAT_SPLAT262]], <1 x double> [[TMP182]])
+; CHECK-NEXT: [[BLOCK263:%.*]] = shufflevector <3 x double> [[COL_LOAD209]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP185:%.*]] = extractelement <4 x double> [[COL_LOAD214]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT264:%.*]] = insertelement <1 x double> poison, double [[TMP185]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT265:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT264]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP186:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK263]], <1 x double> [[SPLAT_SPLAT265]], <1 x double> [[TMP184]])
+; CHECK-NEXT: [[BLOCK266:%.*]] = shufflevector <3 x double> [[COL_LOAD211]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP187:%.*]] = extractelement <4 x double> [[COL_LOAD214]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT267:%.*]] = insertelement <1 x double> poison, double [[TMP187]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT268:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT267]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP188:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK266]], <1 x double> [[SPLAT_SPLAT268]], <1 x double> [[TMP186]])
+; CHECK-NEXT: [[TMP189:%.*]] = shufflevector <1 x double> [[TMP188]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP190:%.*]] = shufflevector <3 x double> [[TMP180]], <3 x double> [[TMP189]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK269:%.*]] = shufflevector <3 x double> [[COL_LOAD205]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP191:%.*]] = extractelement <4 x double> [[COL_LOAD216]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT270:%.*]] = insertelement <2 x double> poison, double [[TMP191]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT271:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT270]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP192:%.*]] = fmul contract <2 x double> [[BLOCK269]], [[SPLAT_SPLAT271]]
+; CHECK-NEXT: [[BLOCK272:%.*]] = shufflevector <3 x double> [[COL_LOAD207]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP193:%.*]] = extractelement <4 x double> [[COL_LOAD216]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT273:%.*]] = insertelement <2 x double> poison, double [[TMP193]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT274:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT273]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP194:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK272]], <2 x double> [[SPLAT_SPLAT274]], <2 x double> [[TMP192]])
+; CHECK-NEXT: [[BLOCK275:%.*]] = shufflevector <3 x double> [[COL_LOAD209]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP195:%.*]] = extractelement <4 x double> [[COL_LOAD216]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT276:%.*]] = insertelement <2 x double> poison, double [[TMP195]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT277:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT276]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP196:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK275]], <2 x double> [[SPLAT_SPLAT277]], <2 x double> [[TMP194]])
+; CHECK-NEXT: [[BLOCK278:%.*]] = shufflevector <3 x double> [[COL_LOAD211]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP197:%.*]] = extractelement <4 x double> [[COL_LOAD216]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT279:%.*]] = insertelement <2 x double> poison, double [[TMP197]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT280:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT279]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP198:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK278]], <2 x double> [[SPLAT_SPLAT280]], <2 x double> [[TMP196]])
+; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x double> [[TMP198]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP200:%.*]] = shufflevector <3 x double> zeroinitializer, <3 x double> [[TMP199]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK281:%.*]] = shufflevector <3 x double> [[TMP200]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK282:%.*]] = shufflevector <3 x double> [[COL_LOAD205]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP201:%.*]] = extractelement <4 x double> [[COL_LOAD216]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT283:%.*]] = insertelement <1 x double> poison, double [[TMP201]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT284:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT283]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP202:%.*]] = fmul contract <1 x double> [[BLOCK282]], [[SPLAT_SPLAT284]]
+; CHECK-NEXT: [[BLOCK285:%.*]] = shufflevector <3 x double> [[COL_LOAD207]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP203:%.*]] = extractelement <4 x double> [[COL_LOAD216]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT286:%.*]] = insertelement <1 x double> poison, double [[TMP203]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT287:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT286]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP204:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK285]], <1 x double> [[SPLAT_SPLAT287]], <1 x double> [[TMP202]])
+; CHECK-NEXT: [[BLOCK288:%.*]] = shufflevector <3 x double> [[COL_LOAD209]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP205:%.*]] = extractelement <4 x double> [[COL_LOAD216]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT289:%.*]] = insertelement <1 x double> poison, double [[TMP205]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT290:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT289]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP206:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK288]], <1 x double> [[SPLAT_SPLAT290]], <1 x double> [[TMP204]])
+; CHECK-NEXT: [[BLOCK291:%.*]] = shufflevector <3 x double> [[COL_LOAD211]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP207:%.*]] = extractelement <4 x double> [[COL_LOAD216]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT292:%.*]] = insertelement <1 x double> poison, double [[TMP207]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT293:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT292]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP208:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK291]], <1 x double> [[SPLAT_SPLAT293]], <1 x double> [[TMP206]])
+; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <1 x double> [[TMP208]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP210:%.*]] = shufflevector <3 x double> [[TMP200]], <3 x double> [[TMP209]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK294:%.*]] = shufflevector <3 x double> [[COL_LOAD205]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP211:%.*]] = extractelement <4 x double> [[COL_LOAD218]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT295:%.*]] = insertelement <2 x double> poison, double [[TMP211]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT296:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT295]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP212:%.*]] = fmul contract <2 x double> [[BLOCK294]], [[SPLAT_SPLAT296]]
+; CHECK-NEXT: [[BLOCK297:%.*]] = shufflevector <3 x double> [[COL_LOAD207]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP213:%.*]] = extractelement <4 x double> [[COL_LOAD218]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT298:%.*]] = insertelement <2 x double> poison, double [[TMP213]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT299:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT298]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP214:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK297]], <2 x double> [[SPLAT_SPLAT299]], <2 x double> [[TMP212]])
+; CHECK-NEXT: [[BLOCK300:%.*]] = shufflevector <3 x double> [[COL_LOAD209]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP215:%.*]] = extractelement <4 x double> [[COL_LOAD218]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT301:%.*]] = insertelement <2 x double> poison, double [[TMP215]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT302:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT301]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP216:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK300]], <2 x double> [[SPLAT_SPLAT302]], <2 x double> [[TMP214]])
+; CHECK-NEXT: [[BLOCK303:%.*]] = shufflevector <3 x double> [[COL_LOAD211]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP217:%.*]] = extractelement <4 x double> [[COL_LOAD218]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT304:%.*]] = insertelement <2 x double> poison, double [[TMP217]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT305:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT304]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP218:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK303]], <2 x double> [[SPLAT_SPLAT305]], <2 x double> [[TMP216]])
+; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x double> [[TMP218]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <3 x double> zeroinitializer, <3 x double> [[TMP219]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK306:%.*]] = shufflevector <3 x double> [[TMP220]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK307:%.*]] = shufflevector <3 x double> [[COL_LOAD205]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP221:%.*]] = extractelement <4 x double> [[COL_LOAD218]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT308:%.*]] = insertelement <1 x double> poison, double [[TMP221]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT309:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT308]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP222:%.*]] = fmul contract <1 x double> [[BLOCK307]], [[SPLAT_SPLAT309]]
+; CHECK-NEXT: [[BLOCK310:%.*]] = shufflevector <3 x double> [[COL_LOAD207]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP223:%.*]] = extractelement <4 x double> [[COL_LOAD218]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT311:%.*]] = insertelement <1 x double> poison, double [[TMP223]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT312:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT311]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP224:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK310]], <1 x double> [[SPLAT_SPLAT312]], <1 x double> [[TMP222]])
+; CHECK-NEXT: [[BLOCK313:%.*]] = shufflevector <3 x double> [[COL_LOAD209]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP225:%.*]] = extractelement <4 x double> [[COL_LOAD218]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT314:%.*]] = insertelement <1 x double> poison, double [[TMP225]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT315:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT314]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP226:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK313]], <1 x double> [[SPLAT_SPLAT315]], <1 x double> [[TMP224]])
+; CHECK-NEXT: [[BLOCK316:%.*]] = shufflevector <3 x double> [[COL_LOAD211]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP227:%.*]] = extractelement <4 x double> [[COL_LOAD218]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT317:%.*]] = insertelement <1 x double> poison, double [[TMP227]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT318:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT317]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP228:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK316]], <1 x double> [[SPLAT_SPLAT318]], <1 x double> [[TMP226]])
+; CHECK-NEXT: [[TMP229:%.*]] = shufflevector <1 x double> [[TMP228]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP230:%.*]] = shufflevector <3 x double> [[TMP220]], <3 x double> [[TMP229]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[TMP231:%.*]] = getelementptr double, ptr [[A]], i64 32
+; CHECK-NEXT: [[COL_LOAD319:%.*]] = load <3 x double>, ptr [[TMP231]], align 8
+; CHECK-NEXT: [[VEC_GEP320:%.*]] = getelementptr double, ptr [[TMP231]], i64 7
+; CHECK-NEXT: [[COL_LOAD321:%.*]] = load <3 x double>, ptr [[VEC_GEP320]], align 8
+; CHECK-NEXT: [[VEC_GEP322:%.*]] = getelementptr double, ptr [[TMP231]], i64 14
+; CHECK-NEXT: [[COL_LOAD323:%.*]] = load <3 x double>, ptr [[VEC_GEP322]], align 8
+; CHECK-NEXT: [[TMP232:%.*]] = getelementptr double, ptr [[B]], i64 4
+; CHECK-NEXT: [[COL_LOAD324:%.*]] = load <3 x double>, ptr [[TMP232]], align 8
+; CHECK-NEXT: [[VEC_GEP325:%.*]] = getelementptr double, ptr [[TMP232]], i64 7
+; CHECK-NEXT: [[COL_LOAD326:%.*]] = load <3 x double>, ptr [[VEC_GEP325]], align 8
+; CHECK-NEXT: [[VEC_GEP327:%.*]] = getelementptr double, ptr [[TMP232]], i64 14
+; CHECK-NEXT: [[COL_LOAD328:%.*]] = load <3 x double>, ptr [[VEC_GEP327]], align 8
+; CHECK-NEXT: [[VEC_GEP329:%.*]] = getelementptr double, ptr [[TMP232]], i64 21
+; CHECK-NEXT: [[COL_LOAD330:%.*]] = load <3 x double>, ptr [[VEC_GEP329]], align 8
+; CHECK-NEXT: [[BLOCK331:%.*]] = shufflevector <3 x double> [[TMP170]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK332:%.*]] = shufflevector <3 x double> [[COL_LOAD319]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP233:%.*]] = extractelement <3 x double> [[COL_LOAD324]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT333:%.*]] = insertelement <2 x double> poison, double [[TMP233]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT334:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT333]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP234:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK332]], <2 x double> [[SPLAT_SPLAT334]], <2 x double> [[BLOCK331]])
+; CHECK-NEXT: [[BLOCK335:%.*]] = shufflevector <3 x double> [[COL_LOAD321]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP235:%.*]] = extractelement <3 x double> [[COL_LOAD324]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT336:%.*]] = insertelement <2 x double> poison, double [[TMP235]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT337:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT336]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP236:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK335]], <2 x double> [[SPLAT_SPLAT337]], <2 x double> [[TMP234]])
+; CHECK-NEXT: [[BLOCK338:%.*]] = shufflevector <3 x double> [[COL_LOAD323]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP237:%.*]] = extractelement <3 x double> [[COL_LOAD324]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT339:%.*]] = insertelement <2 x double> poison, double [[TMP237]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT340:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT339]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP238:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK338]], <2 x double> [[SPLAT_SPLAT340]], <2 x double> [[TMP236]])
+; CHECK-NEXT: [[TMP239:%.*]] = shufflevector <2 x double> [[TMP238]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP240:%.*]] = shufflevector <3 x double> [[TMP170]], <3 x double> [[TMP239]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK341:%.*]] = shufflevector <3 x double> [[TMP240]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK342:%.*]] = shufflevector <3 x double> [[COL_LOAD319]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP241:%.*]] = extractelement <3 x double> [[COL_LOAD324]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT343:%.*]] = insertelement <1 x double> poison, double [[TMP241]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT344:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT343]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP242:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK342]], <1 x double> [[SPLAT_SPLAT344]], <1 x double> [[BLOCK341]])
+; CHECK-NEXT: [[BLOCK345:%.*]] = shufflevector <3 x double> [[COL_LOAD321]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP243:%.*]] = extractelement <3 x double> [[COL_LOAD324]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT346:%.*]] = insertelement <1 x double> poison, double [[TMP243]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT347:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT346]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP244:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK345]], <1 x double> [[SPLAT_SPLAT347]], <1 x double> [[TMP242]])
+; CHECK-NEXT: [[BLOCK348:%.*]] = shufflevector <3 x double> [[COL_LOAD323]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP245:%.*]] = extractelement <3 x double> [[COL_LOAD324]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT349:%.*]] = insertelement <1 x double> poison, double [[TMP245]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT350:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT349]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP246:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK348]], <1 x double> [[SPLAT_SPLAT350]], <1 x double> [[TMP244]])
+; CHECK-NEXT: [[TMP247:%.*]] = shufflevector <1 x double> [[TMP246]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP248:%.*]] = shufflevector <3 x double> [[TMP240]], <3 x double> [[TMP247]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK351:%.*]] = shufflevector <3 x double> [[TMP190]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK352:%.*]] = shufflevector <3 x double> [[COL_LOAD319]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP249:%.*]] = extractelement <3 x double> [[COL_LOAD326]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT353:%.*]] = insertelement <2 x double> poison, double [[TMP249]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT354:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT353]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP250:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK352]], <2 x double> [[SPLAT_SPLAT354]], <2 x double> [[BLOCK351]])
+; CHECK-NEXT: [[BLOCK355:%.*]] = shufflevector <3 x double> [[COL_LOAD321]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP251:%.*]] = extractelement <3 x double> [[COL_LOAD326]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT356:%.*]] = insertelement <2 x double> poison, double [[TMP251]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT357:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT356]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP252:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK355]], <2 x double> [[SPLAT_SPLAT357]], <2 x double> [[TMP250]])
+; CHECK-NEXT: [[BLOCK358:%.*]] = shufflevector <3 x double> [[COL_LOAD323]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP253:%.*]] = extractelement <3 x double> [[COL_LOAD326]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT359:%.*]] = insertelement <2 x double> poison, double [[TMP253]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT360:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT359]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP254:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK358]], <2 x double> [[SPLAT_SPLAT360]], <2 x double> [[TMP252]])
+; CHECK-NEXT: [[TMP255:%.*]] = shufflevector <2 x double> [[TMP254]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP256:%.*]] = shufflevector <3 x double> [[TMP190]], <3 x double> [[TMP255]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK361:%.*]] = shufflevector <3 x double> [[TMP256]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK362:%.*]] = shufflevector <3 x double> [[COL_LOAD319]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP257:%.*]] = extractelement <3 x double> [[COL_LOAD326]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT363:%.*]] = insertelement <1 x double> poison, double [[TMP257]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT364:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT363]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP258:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK362]], <1 x double> [[SPLAT_SPLAT364]], <1 x double> [[BLOCK361]])
+; CHECK-NEXT: [[BLOCK365:%.*]] = shufflevector <3 x double> [[COL_LOAD321]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP259:%.*]] = extractelement <3 x double> [[COL_LOAD326]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT366:%.*]] = insertelement <1 x double> poison, double [[TMP259]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT367:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT366]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP260:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK365]], <1 x double> [[SPLAT_SPLAT367]], <1 x double> [[TMP258]])
+; CHECK-NEXT: [[BLOCK368:%.*]] = shufflevector <3 x double> [[COL_LOAD323]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP261:%.*]] = extractelement <3 x double> [[COL_LOAD326]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT369:%.*]] = insertelement <1 x double> poison, double [[TMP261]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT370:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT369]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP262:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK368]], <1 x double> [[SPLAT_SPLAT370]], <1 x double> [[TMP260]])
+; CHECK-NEXT: [[TMP263:%.*]] = shufflevector <1 x double> [[TMP262]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP264:%.*]] = shufflevector <3 x double> [[TMP256]], <3 x double> [[TMP263]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK371:%.*]] = shufflevector <3 x double> [[TMP210]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK372:%.*]] = shufflevector <3 x double> [[COL_LOAD319]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP265:%.*]] = extractelement <3 x double> [[COL_LOAD328]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT373:%.*]] = insertelement <2 x double> poison, double [[TMP265]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT374:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT373]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP266:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK372]], <2 x double> [[SPLAT_SPLAT374]], <2 x double> [[BLOCK371]])
+; CHECK-NEXT: [[BLOCK375:%.*]] = shufflevector <3 x double> [[COL_LOAD321]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP267:%.*]] = extractelement <3 x double> [[COL_LOAD328]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT376:%.*]] = insertelement <2 x double> poison, double [[TMP267]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT377:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT376]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP268:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK375]], <2 x double> [[SPLAT_SPLAT377]], <2 x double> [[TMP266]])
+; CHECK-NEXT: [[BLOCK378:%.*]] = shufflevector <3 x double> [[COL_LOAD323]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP269:%.*]] = extractelement <3 x double> [[COL_LOAD328]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT379:%.*]] = insertelement <2 x double> poison, double [[TMP269]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT380:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT379]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP270:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK378]], <2 x double> [[SPLAT_SPLAT380]], <2 x double> [[TMP268]])
+; CHECK-NEXT: [[TMP271:%.*]] = shufflevector <2 x double> [[TMP270]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP272:%.*]] = shufflevector <3 x double> [[TMP210]], <3 x double> [[TMP271]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK381:%.*]] = shufflevector <3 x double> [[TMP272]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK382:%.*]] = shufflevector <3 x double> [[COL_LOAD319]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP273:%.*]] = extractelement <3 x double> [[COL_LOAD328]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT383:%.*]] = insertelement <1 x double> poison, double [[TMP273]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT384:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT383]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP274:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK382]], <1 x double> [[SPLAT_SPLAT384]], <1 x double> [[BLOCK381]])
+; CHECK-NEXT: [[BLOCK385:%.*]] = shufflevector <3 x double> [[COL_LOAD321]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP275:%.*]] = extractelement <3 x double> [[COL_LOAD328]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT386:%.*]] = insertelement <1 x double> poison, double [[TMP275]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT387:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT386]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP276:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK385]], <1 x double> [[SPLAT_SPLAT387]], <1 x double> [[TMP274]])
+; CHECK-NEXT: [[BLOCK388:%.*]] = shufflevector <3 x double> [[COL_LOAD323]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP277:%.*]] = extractelement <3 x double> [[COL_LOAD328]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT389:%.*]] = insertelement <1 x double> poison, double [[TMP277]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT390:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT389]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP278:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK388]], <1 x double> [[SPLAT_SPLAT390]], <1 x double> [[TMP276]])
+; CHECK-NEXT: [[TMP279:%.*]] = shufflevector <1 x double> [[TMP278]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP280:%.*]] = shufflevector <3 x double> [[TMP272]], <3 x double> [[TMP279]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK391:%.*]] = shufflevector <3 x double> [[TMP230]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK392:%.*]] = shufflevector <3 x double> [[COL_LOAD319]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP281:%.*]] = extractelement <3 x double> [[COL_LOAD330]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT393:%.*]] = insertelement <2 x double> poison, double [[TMP281]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT394:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT393]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP282:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK392]], <2 x double> [[SPLAT_SPLAT394]], <2 x double> [[BLOCK391]])
+; CHECK-NEXT: [[BLOCK395:%.*]] = shufflevector <3 x double> [[COL_LOAD321]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP283:%.*]] = extractelement <3 x double> [[COL_LOAD330]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT396:%.*]] = insertelement <2 x double> poison, double [[TMP283]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT397:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT396]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP284:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK395]], <2 x double> [[SPLAT_SPLAT397]], <2 x double> [[TMP282]])
+; CHECK-NEXT: [[BLOCK398:%.*]] = shufflevector <3 x double> [[COL_LOAD323]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP285:%.*]] = extractelement <3 x double> [[COL_LOAD330]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT399:%.*]] = insertelement <2 x double> poison, double [[TMP285]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT400:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT399]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP286:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK398]], <2 x double> [[SPLAT_SPLAT400]], <2 x double> [[TMP284]])
+; CHECK-NEXT: [[TMP287:%.*]] = shufflevector <2 x double> [[TMP286]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP288:%.*]] = shufflevector <3 x double> [[TMP230]], <3 x double> [[TMP287]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK401:%.*]] = shufflevector <3 x double> [[TMP288]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK402:%.*]] = shufflevector <3 x double> [[COL_LOAD319]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP289:%.*]] = extractelement <3 x double> [[COL_LOAD330]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT403:%.*]] = insertelement <1 x double> poison, double [[TMP289]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT404:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT403]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP290:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK402]], <1 x double> [[SPLAT_SPLAT404]], <1 x double> [[BLOCK401]])
+; CHECK-NEXT: [[BLOCK405:%.*]] = shufflevector <3 x double> [[COL_LOAD321]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP291:%.*]] = extractelement <3 x double> [[COL_LOAD330]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT406:%.*]] = insertelement <1 x double> poison, double [[TMP291]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT407:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT406]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP292:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK405]], <1 x double> [[SPLAT_SPLAT407]], <1 x double> [[TMP290]])
+; CHECK-NEXT: [[BLOCK408:%.*]] = shufflevector <3 x double> [[COL_LOAD323]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP293:%.*]] = extractelement <3 x double> [[COL_LOAD330]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT409:%.*]] = insertelement <1 x double> poison, double [[TMP293]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT410:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT409]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP294:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK408]], <1 x double> [[SPLAT_SPLAT410]], <1 x double> [[TMP292]])
+; CHECK-NEXT: [[TMP295:%.*]] = shufflevector <1 x double> [[TMP294]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP296:%.*]] = shufflevector <3 x double> [[TMP288]], <3 x double> [[TMP295]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[TMP297:%.*]] = getelementptr double, ptr [[C]], i64 4
+; CHECK-NEXT: store <3 x double> [[TMP248]], ptr [[TMP297]], align 8
+; CHECK-NEXT: [[VEC_GEP411:%.*]] = getelementptr double, ptr [[TMP297]], i64 7
+; CHECK-NEXT: store <3 x double> [[TMP264]], ptr [[VEC_GEP411]], align 8
+; CHECK-NEXT: [[VEC_GEP412:%.*]] = getelementptr double, ptr [[TMP297]], i64 14
+; CHECK-NEXT: store <3 x double> [[TMP280]], ptr [[VEC_GEP412]], align 8
+; CHECK-NEXT: [[VEC_GEP413:%.*]] = getelementptr double, ptr [[TMP297]], i64 21
+; CHECK-NEXT: store <3 x double> [[TMP296]], ptr [[VEC_GEP413]], align 8
+; CHECK-NEXT: [[TMP298:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT: [[COL_LOAD414:%.*]] = load <4 x double>, ptr [[TMP298]], align 8
+; CHECK-NEXT: [[VEC_GEP415:%.*]] = getelementptr double, ptr [[TMP298]], i64 7
+; CHECK-NEXT: [[COL_LOAD416:%.*]] = load <4 x double>, ptr [[VEC_GEP415]], align 8
+; CHECK-NEXT: [[VEC_GEP417:%.*]] = getelementptr double, ptr [[TMP298]], i64 14
+; CHECK-NEXT: [[COL_LOAD418:%.*]] = load <4 x double>, ptr [[VEC_GEP417]], align 8
+; CHECK-NEXT: [[VEC_GEP419:%.*]] = getelementptr double, ptr [[TMP298]], i64 21
+; CHECK-NEXT: [[COL_LOAD420:%.*]] = load <4 x double>, ptr [[VEC_GEP419]], align 8
+; CHECK-NEXT: [[TMP299:%.*]] = getelementptr double, ptr [[B]], i64 28
+; CHECK-NEXT: [[COL_LOAD421:%.*]] = load <4 x double>, ptr [[TMP299]], align 8
+; CHECK-NEXT: [[VEC_GEP422:%.*]] = getelementptr double, ptr [[TMP299]], i64 7
+; CHECK-NEXT: [[COL_LOAD423:%.*]] = load <4 x double>, ptr [[VEC_GEP422]], align 8
+; CHECK-NEXT: [[VEC_GEP424:%.*]] = getelementptr double, ptr [[TMP299]], i64 14
+; CHECK-NEXT: [[COL_LOAD425:%.*]] = load <4 x double>, ptr [[VEC_GEP424]], align 8
+; CHECK-NEXT: [[BLOCK426:%.*]] = shufflevector <4 x double> [[COL_LOAD414]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP300:%.*]] = extractelement <4 x double> [[COL_LOAD421]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT427:%.*]] = insertelement <2 x double> poison, double [[TMP300]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT428:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT427]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP301:%.*]] = fmul contract <2 x double> [[BLOCK426]], [[SPLAT_SPLAT428]]
+; CHECK-NEXT: [[BLOCK429:%.*]] = shufflevector <4 x double> [[COL_LOAD416]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP302:%.*]] = extractelement <4 x double> [[COL_LOAD421]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT430:%.*]] = insertelement <2 x double> poison, double [[TMP302]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT431:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT430]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP303:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK429]], <2 x double> [[SPLAT_SPLAT431]], <2 x double> [[TMP301]])
+; CHECK-NEXT: [[BLOCK432:%.*]] = shufflevector <4 x double> [[COL_LOAD418]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP304:%.*]] = extractelement <4 x double> [[COL_LOAD421]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT433:%.*]] = insertelement <2 x double> poison, double [[TMP304]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT434:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT433]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP305:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK432]], <2 x double> [[SPLAT_SPLAT434]], <2 x double> [[TMP303]])
+; CHECK-NEXT: [[BLOCK435:%.*]] = shufflevector <4 x double> [[COL_LOAD420]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP306:%.*]] = extractelement <4 x double> [[COL_LOAD421]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT436:%.*]] = insertelement <2 x double> poison, double [[TMP306]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT437:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT436]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP307:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK435]], <2 x double> [[SPLAT_SPLAT437]], <2 x double> [[TMP305]])
+; CHECK-NEXT: [[TMP308:%.*]] = shufflevector <2 x double> [[TMP307]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP309:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP308]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK438:%.*]] = shufflevector <4 x double> [[TMP309]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK439:%.*]] = shufflevector <4 x double> [[COL_LOAD414]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP310:%.*]] = extractelement <4 x double> [[COL_LOAD421]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT440:%.*]] = insertelement <2 x double> poison, double [[TMP310]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT441:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT440]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP311:%.*]] = fmul contract <2 x double> [[BLOCK439]], [[SPLAT_SPLAT441]]
+; CHECK-NEXT: [[BLOCK442:%.*]] = shufflevector <4 x double> [[COL_LOAD416]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP312:%.*]] = extractelement <4 x double> [[COL_LOAD421]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT443:%.*]] = insertelement <2 x double> poison, double [[TMP312]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT444:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT443]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP313:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK442]], <2 x double> [[SPLAT_SPLAT444]], <2 x double> [[TMP311]])
+; CHECK-NEXT: [[BLOCK445:%.*]] = shufflevector <4 x double> [[COL_LOAD418]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP314:%.*]] = extractelement <4 x double> [[COL_LOAD421]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT446:%.*]] = insertelement <2 x double> poison, double [[TMP314]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT447:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT446]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP315:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK445]], <2 x double> [[SPLAT_SPLAT447]], <2 x double> [[TMP313]])
+; CHECK-NEXT: [[BLOCK448:%.*]] = shufflevector <4 x double> [[COL_LOAD420]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP316:%.*]] = extractelement <4 x double> [[COL_LOAD421]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT449:%.*]] = insertelement <2 x double> poison, double [[TMP316]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT450:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT449]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP317:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK448]], <2 x double> [[SPLAT_SPLAT450]], <2 x double> [[TMP315]])
+; CHECK-NEXT: [[TMP318:%.*]] = shufflevector <2 x double> [[TMP317]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP319:%.*]] = shufflevector <4 x double> [[TMP309]], <4 x double> [[TMP318]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK451:%.*]] = shufflevector <4 x double> [[COL_LOAD414]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP320:%.*]] = extractelement <4 x double> [[COL_LOAD423]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT452:%.*]] = insertelement <2 x double> poison, double [[TMP320]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT453:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT452]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP321:%.*]] = fmul contract <2 x double> [[BLOCK451]], [[SPLAT_SPLAT453]]
+; CHECK-NEXT: [[BLOCK454:%.*]] = shufflevector <4 x double> [[COL_LOAD416]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP322:%.*]] = extractelement <4 x double> [[COL_LOAD423]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT455:%.*]] = insertelement <2 x double> poison, double [[TMP322]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT456:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT455]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP323:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK454]], <2 x double> [[SPLAT_SPLAT456]], <2 x double> [[TMP321]])
+; CHECK-NEXT: [[BLOCK457:%.*]] = shufflevector <4 x double> [[COL_LOAD418]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP324:%.*]] = extractelement <4 x double> [[COL_LOAD423]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT458:%.*]] = insertelement <2 x double> poison, double [[TMP324]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT459:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT458]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP325:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK457]], <2 x double> [[SPLAT_SPLAT459]], <2 x double> [[TMP323]])
+; CHECK-NEXT: [[BLOCK460:%.*]] = shufflevector <4 x double> [[COL_LOAD420]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP326:%.*]] = extractelement <4 x double> [[COL_LOAD423]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT461:%.*]] = insertelement <2 x double> poison, double [[TMP326]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT462:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT461]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP327:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK460]], <2 x double> [[SPLAT_SPLAT462]], <2 x double> [[TMP325]])
+; CHECK-NEXT: [[TMP328:%.*]] = shufflevector <2 x double> [[TMP327]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP329:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP328]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK463:%.*]] = shufflevector <4 x double> [[TMP329]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK464:%.*]] = shufflevector <4 x double> [[COL_LOAD414]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP330:%.*]] = extractelement <4 x double> [[COL_LOAD423]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT465:%.*]] = insertelement <2 x double> poison, double [[TMP330]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT466:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT465]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP331:%.*]] = fmul contract <2 x double> [[BLOCK464]], [[SPLAT_SPLAT466]]
+; CHECK-NEXT: [[BLOCK467:%.*]] = shufflevector <4 x double> [[COL_LOAD416]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP332:%.*]] = extractelement <4 x double> [[COL_LOAD423]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT468:%.*]] = insertelement <2 x double> poison, double [[TMP332]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT469:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT468]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP333:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK467]], <2 x double> [[SPLAT_SPLAT469]], <2 x double> [[TMP331]])
+; CHECK-NEXT: [[BLOCK470:%.*]] = shufflevector <4 x double> [[COL_LOAD418]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP334:%.*]] = extractelement <4 x double> [[COL_LOAD423]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT471:%.*]] = insertelement <2 x double> poison, double [[TMP334]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT472:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT471]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP335:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK470]], <2 x double> [[SPLAT_SPLAT472]], <2 x double> [[TMP333]])
+; CHECK-NEXT: [[BLOCK473:%.*]] = shufflevector <4 x double> [[COL_LOAD420]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP336:%.*]] = extractelement <4 x double> [[COL_LOAD423]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT474:%.*]] = insertelement <2 x double> poison, double [[TMP336]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT475:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT474]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP337:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK473]], <2 x double> [[SPLAT_SPLAT475]], <2 x double> [[TMP335]])
+; CHECK-NEXT: [[TMP338:%.*]] = shufflevector <2 x double> [[TMP337]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP339:%.*]] = shufflevector <4 x double> [[TMP329]], <4 x double> [[TMP338]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK476:%.*]] = shufflevector <4 x double> [[COL_LOAD414]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP340:%.*]] = extractelement <4 x double> [[COL_LOAD425]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT477:%.*]] = insertelement <2 x double> poison, double [[TMP340]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT478:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT477]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP341:%.*]] = fmul contract <2 x double> [[BLOCK476]], [[SPLAT_SPLAT478]]
+; CHECK-NEXT: [[BLOCK479:%.*]] = shufflevector <4 x double> [[COL_LOAD416]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP342:%.*]] = extractelement <4 x double> [[COL_LOAD425]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT480:%.*]] = insertelement <2 x double> poison, double [[TMP342]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT481:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT480]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP343:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK479]], <2 x double> [[SPLAT_SPLAT481]], <2 x double> [[TMP341]])
+; CHECK-NEXT: [[BLOCK482:%.*]] = shufflevector <4 x double> [[COL_LOAD418]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP344:%.*]] = extractelement <4 x double> [[COL_LOAD425]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT483:%.*]] = insertelement <2 x double> poison, double [[TMP344]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT484:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT483]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP345:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK482]], <2 x double> [[SPLAT_SPLAT484]], <2 x double> [[TMP343]])
+; CHECK-NEXT: [[BLOCK485:%.*]] = shufflevector <4 x double> [[COL_LOAD420]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP346:%.*]] = extractelement <4 x double> [[COL_LOAD425]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT486:%.*]] = insertelement <2 x double> poison, double [[TMP346]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT487:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT486]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP347:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK485]], <2 x double> [[SPLAT_SPLAT487]], <2 x double> [[TMP345]])
+; CHECK-NEXT: [[TMP348:%.*]] = shufflevector <2 x double> [[TMP347]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP349:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP348]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK488:%.*]] = shufflevector <4 x double> [[TMP349]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK489:%.*]] = shufflevector <4 x double> [[COL_LOAD414]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP350:%.*]] = extractelement <4 x double> [[COL_LOAD425]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT490:%.*]] = insertelement <2 x double> poison, double [[TMP350]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT491:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT490]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP351:%.*]] = fmul contract <2 x double> [[BLOCK489]], [[SPLAT_SPLAT491]]
+; CHECK-NEXT: [[BLOCK492:%.*]] = shufflevector <4 x double> [[COL_LOAD416]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP352:%.*]] = extractelement <4 x double> [[COL_LOAD425]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT493:%.*]] = insertelement <2 x double> poison, double [[TMP352]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT494:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT493]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP353:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK492]], <2 x double> [[SPLAT_SPLAT494]], <2 x double> [[TMP351]])
+; CHECK-NEXT: [[BLOCK495:%.*]] = shufflevector <4 x double> [[COL_LOAD418]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP354:%.*]] = extractelement <4 x double> [[COL_LOAD425]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT496:%.*]] = insertelement <2 x double> poison, double [[TMP354]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT497:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT496]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP355:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK495]], <2 x double> [[SPLAT_SPLAT497]], <2 x double> [[TMP353]])
+; CHECK-NEXT: [[BLOCK498:%.*]] = shufflevector <4 x double> [[COL_LOAD420]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP356:%.*]] = extractelement <4 x double> [[COL_LOAD425]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT499:%.*]] = insertelement <2 x double> poison, double [[TMP356]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT500:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT499]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP357:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK498]], <2 x double> [[SPLAT_SPLAT500]], <2 x double> [[TMP355]])
+; CHECK-NEXT: [[TMP358:%.*]] = shufflevector <2 x double> [[TMP357]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP359:%.*]] = shufflevector <4 x double> [[TMP349]], <4 x double> [[TMP358]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP360:%.*]] = getelementptr double, ptr [[A]], i64 28
+; CHECK-NEXT: [[COL_LOAD501:%.*]] = load <4 x double>, ptr [[TMP360]], align 8
+; CHECK-NEXT: [[VEC_GEP502:%.*]] = getelementptr double, ptr [[TMP360]], i64 7
+; CHECK-NEXT: [[COL_LOAD503:%.*]] = load <4 x double>, ptr [[VEC_GEP502]], align 8
+; CHECK-NEXT: [[VEC_GEP504:%.*]] = getelementptr double, ptr [[TMP360]], i64 14
+; CHECK-NEXT: [[COL_LOAD505:%.*]] = load <4 x double>, ptr [[VEC_GEP504]], align 8
+; CHECK-NEXT: [[TMP361:%.*]] = getelementptr double, ptr [[B]], i64 32
+; CHECK-NEXT: [[COL_LOAD506:%.*]] = load <3 x double>, ptr [[TMP361]], align 8
+; CHECK-NEXT: [[VEC_GEP507:%.*]] = getelementptr double, ptr [[TMP361]], i64 7
+; CHECK-NEXT: [[COL_LOAD508:%.*]] = load <3 x double>, ptr [[VEC_GEP507]], align 8
+; CHECK-NEXT: [[VEC_GEP509:%.*]] = getelementptr double, ptr [[TMP361]], i64 14
+; CHECK-NEXT: [[COL_LOAD510:%.*]] = load <3 x double>, ptr [[VEC_GEP509]], align 8
+; CHECK-NEXT: [[BLOCK511:%.*]] = shufflevector <4 x double> [[TMP319]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK512:%.*]] = shufflevector <4 x double> [[COL_LOAD501]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP362:%.*]] = extractelement <3 x double> [[COL_LOAD506]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT513:%.*]] = insertelement <2 x double> poison, double [[TMP362]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT514:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT513]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP363:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK512]], <2 x double> [[SPLAT_SPLAT514]], <2 x double> [[BLOCK511]])
+; CHECK-NEXT: [[BLOCK515:%.*]] = shufflevector <4 x double> [[COL_LOAD503]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP364:%.*]] = extractelement <3 x double> [[COL_LOAD506]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT516:%.*]] = insertelement <2 x double> poison, double [[TMP364]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT517:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT516]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP365:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK515]], <2 x double> [[SPLAT_SPLAT517]], <2 x double> [[TMP363]])
+; CHECK-NEXT: [[BLOCK518:%.*]] = shufflevector <4 x double> [[COL_LOAD505]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP366:%.*]] = extractelement <3 x double> [[COL_LOAD506]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT519:%.*]] = insertelement <2 x double> poison, double [[TMP366]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT520:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT519]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP367:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK518]], <2 x double> [[SPLAT_SPLAT520]], <2 x double> [[TMP365]])
+; CHECK-NEXT: [[TMP368:%.*]] = shufflevector <2 x double> [[TMP367]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP369:%.*]] = shufflevector <4 x double> [[TMP319]], <4 x double> [[TMP368]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK521:%.*]] = shufflevector <4 x double> [[TMP369]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK522:%.*]] = shufflevector <4 x double> [[COL_LOAD501]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP370:%.*]] = extractelement <3 x double> [[COL_LOAD506]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT523:%.*]] = insertelement <2 x double> poison, double [[TMP370]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT524:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT523]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP371:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK522]], <2 x double> [[SPLAT_SPLAT524]], <2 x double> [[BLOCK521]])
+; CHECK-NEXT: [[BLOCK525:%.*]] = shufflevector <4 x double> [[COL_LOAD503]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP372:%.*]] = extractelement <3 x double> [[COL_LOAD506]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT526:%.*]] = insertelement <2 x double> poison, double [[TMP372]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT527:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT526]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP373:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK525]], <2 x double> [[SPLAT_SPLAT527]], <2 x double> [[TMP371]])
+; CHECK-NEXT: [[BLOCK528:%.*]] = shufflevector <4 x double> [[COL_LOAD505]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP374:%.*]] = extractelement <3 x double> [[COL_LOAD506]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT529:%.*]] = insertelement <2 x double> poison, double [[TMP374]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT530:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT529]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP375:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK528]], <2 x double> [[SPLAT_SPLAT530]], <2 x double> [[TMP373]])
+; CHECK-NEXT: [[TMP376:%.*]] = shufflevector <2 x double> [[TMP375]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP377:%.*]] = shufflevector <4 x double> [[TMP369]], <4 x double> [[TMP376]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK531:%.*]] = shufflevector <4 x double> [[TMP339]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK532:%.*]] = shufflevector <4 x double> [[COL_LOAD501]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP378:%.*]] = extractelement <3 x double> [[COL_LOAD508]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT533:%.*]] = insertelement <2 x double> poison, double [[TMP378]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT534:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT533]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP379:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK532]], <2 x double> [[SPLAT_SPLAT534]], <2 x double> [[BLOCK531]])
+; CHECK-NEXT: [[BLOCK535:%.*]] = shufflevector <4 x double> [[COL_LOAD503]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP380:%.*]] = extractelement <3 x double> [[COL_LOAD508]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT536:%.*]] = insertelement <2 x double> poison, double [[TMP380]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT537:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT536]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP381:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK535]], <2 x double> [[SPLAT_SPLAT537]], <2 x double> [[TMP379]])
+; CHECK-NEXT: [[BLOCK538:%.*]] = shufflevector <4 x double> [[COL_LOAD505]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP382:%.*]] = extractelement <3 x double> [[COL_LOAD508]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT539:%.*]] = insertelement <2 x double> poison, double [[TMP382]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT540:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT539]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP383:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK538]], <2 x double> [[SPLAT_SPLAT540]], <2 x double> [[TMP381]])
+; CHECK-NEXT: [[TMP384:%.*]] = shufflevector <2 x double> [[TMP383]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP385:%.*]] = shufflevector <4 x double> [[TMP339]], <4 x double> [[TMP384]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK541:%.*]] = shufflevector <4 x double> [[TMP385]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK542:%.*]] = shufflevector <4 x double> [[COL_LOAD501]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP386:%.*]] = extractelement <3 x double> [[COL_LOAD508]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT543:%.*]] = insertelement <2 x double> poison, double [[TMP386]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT544:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT543]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP387:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK542]], <2 x double> [[SPLAT_SPLAT544]], <2 x double> [[BLOCK541]])
+; CHECK-NEXT: [[BLOCK545:%.*]] = shufflevector <4 x double> [[COL_LOAD503]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP388:%.*]] = extractelement <3 x double> [[COL_LOAD508]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT546:%.*]] = insertelement <2 x double> poison, double [[TMP388]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT547:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT546]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP389:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK545]], <2 x double> [[SPLAT_SPLAT547]], <2 x double> [[TMP387]])
+; CHECK-NEXT: [[BLOCK548:%.*]] = shufflevector <4 x double> [[COL_LOAD505]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP390:%.*]] = extractelement <3 x double> [[COL_LOAD508]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT549:%.*]] = insertelement <2 x double> poison, double [[TMP390]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT550:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT549]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP391:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK548]], <2 x double> [[SPLAT_SPLAT550]], <2 x double> [[TMP389]])
+; CHECK-NEXT: [[TMP392:%.*]] = shufflevector <2 x double> [[TMP391]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP393:%.*]] = shufflevector <4 x double> [[TMP385]], <4 x double> [[TMP392]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[BLOCK551:%.*]] = shufflevector <4 x double> [[TMP359]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK552:%.*]] = shufflevector <4 x double> [[COL_LOAD501]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP394:%.*]] = extractelement <3 x double> [[COL_LOAD510]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT553:%.*]] = insertelement <2 x double> poison, double [[TMP394]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT554:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT553]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP395:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK552]], <2 x double> [[SPLAT_SPLAT554]], <2 x double> [[BLOCK551]])
+; CHECK-NEXT: [[BLOCK555:%.*]] = shufflevector <4 x double> [[COL_LOAD503]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP396:%.*]] = extractelement <3 x double> [[COL_LOAD510]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT556:%.*]] = insertelement <2 x double> poison, double [[TMP396]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT557:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT556]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP397:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK555]], <2 x double> [[SPLAT_SPLAT557]], <2 x double> [[TMP395]])
+; CHECK-NEXT: [[BLOCK558:%.*]] = shufflevector <4 x double> [[COL_LOAD505]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP398:%.*]] = extractelement <3 x double> [[COL_LOAD510]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT559:%.*]] = insertelement <2 x double> poison, double [[TMP398]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT560:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT559]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP399:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK558]], <2 x double> [[SPLAT_SPLAT560]], <2 x double> [[TMP397]])
+; CHECK-NEXT: [[TMP400:%.*]] = shufflevector <2 x double> [[TMP399]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP401:%.*]] = shufflevector <4 x double> [[TMP359]], <4 x double> [[TMP400]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK561:%.*]] = shufflevector <4 x double> [[TMP401]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[BLOCK562:%.*]] = shufflevector <4 x double> [[COL_LOAD501]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP402:%.*]] = extractelement <3 x double> [[COL_LOAD510]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT563:%.*]] = insertelement <2 x double> poison, double [[TMP402]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT564:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT563]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP403:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK562]], <2 x double> [[SPLAT_SPLAT564]], <2 x double> [[BLOCK561]])
+; CHECK-NEXT: [[BLOCK565:%.*]] = shufflevector <4 x double> [[COL_LOAD503]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP404:%.*]] = extractelement <3 x double> [[COL_LOAD510]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT566:%.*]] = insertelement <2 x double> poison, double [[TMP404]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT567:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT566]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP405:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK565]], <2 x double> [[SPLAT_SPLAT567]], <2 x double> [[TMP403]])
+; CHECK-NEXT: [[BLOCK568:%.*]] = shufflevector <4 x double> [[COL_LOAD505]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP406:%.*]] = extractelement <3 x double> [[COL_LOAD510]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT569:%.*]] = insertelement <2 x double> poison, double [[TMP406]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT570:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT569]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP407:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK568]], <2 x double> [[SPLAT_SPLAT570]], <2 x double> [[TMP405]])
+; CHECK-NEXT: [[TMP408:%.*]] = shufflevector <2 x double> [[TMP407]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP409:%.*]] = shufflevector <4 x double> [[TMP401]], <4 x double> [[TMP408]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP410:%.*]] = getelementptr double, ptr [[C]], i64 28
+; CHECK-NEXT: store <4 x double> [[TMP377]], ptr [[TMP410]], align 8
+; CHECK-NEXT: [[VEC_GEP571:%.*]] = getelementptr double, ptr [[TMP410]], i64 7
+; CHECK-NEXT: store <4 x double> [[TMP393]], ptr [[VEC_GEP571]], align 8
+; CHECK-NEXT: [[VEC_GEP572:%.*]] = getelementptr double, ptr [[TMP410]], i64 14
+; CHECK-NEXT: store <4 x double> [[TMP409]], ptr [[VEC_GEP572]], align 8
+; CHECK-NEXT: [[TMP411:%.*]] = getelementptr double, ptr [[A]], i64 4
+; CHECK-NEXT: [[COL_LOAD573:%.*]] = load <3 x double>, ptr [[TMP411]], align 8
+; CHECK-NEXT: [[VEC_GEP574:%.*]] = getelementptr double, ptr [[TMP411]], i64 7
+; CHECK-NEXT: [[COL_LOAD575:%.*]] = load <3 x double>, ptr [[VEC_GEP574]], align 8
+; CHECK-NEXT: [[VEC_GEP576:%.*]] = getelementptr double, ptr [[TMP411]], i64 14
+; CHECK-NEXT: [[COL_LOAD577:%.*]] = load <3 x double>, ptr [[VEC_GEP576]], align 8
+; CHECK-NEXT: [[VEC_GEP578:%.*]] = getelementptr double, ptr [[TMP411]], i64 21
+; CHECK-NEXT: [[COL_LOAD579:%.*]] = load <3 x double>, ptr [[VEC_GEP578]], align 8
+; CHECK-NEXT: [[TMP412:%.*]] = getelementptr double, ptr [[B]], i64 28
+; CHECK-NEXT: [[COL_LOAD580:%.*]] = load <4 x double>, ptr [[TMP412]], align 8
+; CHECK-NEXT: [[VEC_GEP581:%.*]] = getelementptr double, ptr [[TMP412]], i64 7
+; CHECK-NEXT: [[COL_LOAD582:%.*]] = load <4 x double>, ptr [[VEC_GEP581]], align 8
+; CHECK-NEXT: [[VEC_GEP583:%.*]] = getelementptr double, ptr [[TMP412]], i64 14
+; CHECK-NEXT: [[COL_LOAD584:%.*]] = load <4 x double>, ptr [[VEC_GEP583]], align 8
+; CHECK-NEXT: [[BLOCK585:%.*]] = shufflevector <3 x double> [[COL_LOAD573]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP413:%.*]] = extractelement <4 x double> [[COL_LOAD580]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT586:%.*]] = insertelement <2 x double> poison, double [[TMP413]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT587:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT586]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP414:%.*]] = fmul contract <2 x double> [[BLOCK585]], [[SPLAT_SPLAT587]]
+; CHECK-NEXT: [[BLOCK588:%.*]] = shufflevector <3 x double> [[COL_LOAD575]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP415:%.*]] = extractelement <4 x double> [[COL_LOAD580]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT589:%.*]] = insertelement <2 x double> poison, double [[TMP415]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT590:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT589]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP416:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK588]], <2 x double> [[SPLAT_SPLAT590]], <2 x double> [[TMP414]])
+; CHECK-NEXT: [[BLOCK591:%.*]] = shufflevector <3 x double> [[COL_LOAD577]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP417:%.*]] = extractelement <4 x double> [[COL_LOAD580]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT592:%.*]] = insertelement <2 x double> poison, double [[TMP417]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT593:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT592]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP418:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK591]], <2 x double> [[SPLAT_SPLAT593]], <2 x double> [[TMP416]])
+; CHECK-NEXT: [[BLOCK594:%.*]] = shufflevector <3 x double> [[COL_LOAD579]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP419:%.*]] = extractelement <4 x double> [[COL_LOAD580]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT595:%.*]] = insertelement <2 x double> poison, double [[TMP419]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT596:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT595]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP420:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK594]], <2 x double> [[SPLAT_SPLAT596]], <2 x double> [[TMP418]])
+; CHECK-NEXT: [[TMP421:%.*]] = shufflevector <2 x double> [[TMP420]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP422:%.*]] = shufflevector <3 x double> zeroinitializer, <3 x double> [[TMP421]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK597:%.*]] = shufflevector <3 x double> [[TMP422]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK598:%.*]] = shufflevector <3 x double> [[COL_LOAD573]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP423:%.*]] = extractelement <4 x double> [[COL_LOAD580]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT599:%.*]] = insertelement <1 x double> poison, double [[TMP423]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT600:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT599]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP424:%.*]] = fmul contract <1 x double> [[BLOCK598]], [[SPLAT_SPLAT600]]
+; CHECK-NEXT: [[BLOCK601:%.*]] = shufflevector <3 x double> [[COL_LOAD575]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP425:%.*]] = extractelement <4 x double> [[COL_LOAD580]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT602:%.*]] = insertelement <1 x double> poison, double [[TMP425]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT603:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT602]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP426:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK601]], <1 x double> [[SPLAT_SPLAT603]], <1 x double> [[TMP424]])
+; CHECK-NEXT: [[BLOCK604:%.*]] = shufflevector <3 x double> [[COL_LOAD577]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP427:%.*]] = extractelement <4 x double> [[COL_LOAD580]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT605:%.*]] = insertelement <1 x double> poison, double [[TMP427]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT606:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT605]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP428:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK604]], <1 x double> [[SPLAT_SPLAT606]], <1 x double> [[TMP426]])
+; CHECK-NEXT: [[BLOCK607:%.*]] = shufflevector <3 x double> [[COL_LOAD579]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP429:%.*]] = extractelement <4 x double> [[COL_LOAD580]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT608:%.*]] = insertelement <1 x double> poison, double [[TMP429]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT609:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT608]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP430:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK607]], <1 x double> [[SPLAT_SPLAT609]], <1 x double> [[TMP428]])
+; CHECK-NEXT: [[TMP431:%.*]] = shufflevector <1 x double> [[TMP430]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP432:%.*]] = shufflevector <3 x double> [[TMP422]], <3 x double> [[TMP431]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK610:%.*]] = shufflevector <3 x double> [[COL_LOAD573]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP433:%.*]] = extractelement <4 x double> [[COL_LOAD582]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT611:%.*]] = insertelement <2 x double> poison, double [[TMP433]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT612:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT611]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP434:%.*]] = fmul contract <2 x double> [[BLOCK610]], [[SPLAT_SPLAT612]]
+; CHECK-NEXT: [[BLOCK613:%.*]] = shufflevector <3 x double> [[COL_LOAD575]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP435:%.*]] = extractelement <4 x double> [[COL_LOAD582]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT614:%.*]] = insertelement <2 x double> poison, double [[TMP435]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT615:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT614]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP436:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK613]], <2 x double> [[SPLAT_SPLAT615]], <2 x double> [[TMP434]])
+; CHECK-NEXT: [[BLOCK616:%.*]] = shufflevector <3 x double> [[COL_LOAD577]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP437:%.*]] = extractelement <4 x double> [[COL_LOAD582]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT617:%.*]] = insertelement <2 x double> poison, double [[TMP437]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT618:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT617]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP438:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK616]], <2 x double> [[SPLAT_SPLAT618]], <2 x double> [[TMP436]])
+; CHECK-NEXT: [[BLOCK619:%.*]] = shufflevector <3 x double> [[COL_LOAD579]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP439:%.*]] = extractelement <4 x double> [[COL_LOAD582]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT620:%.*]] = insertelement <2 x double> poison, double [[TMP439]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT621:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT620]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP440:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK619]], <2 x double> [[SPLAT_SPLAT621]], <2 x double> [[TMP438]])
+; CHECK-NEXT: [[TMP441:%.*]] = shufflevector <2 x double> [[TMP440]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP442:%.*]] = shufflevector <3 x double> zeroinitializer, <3 x double> [[TMP441]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK622:%.*]] = shufflevector <3 x double> [[TMP442]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK623:%.*]] = shufflevector <3 x double> [[COL_LOAD573]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP443:%.*]] = extractelement <4 x double> [[COL_LOAD582]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT624:%.*]] = insertelement <1 x double> poison, double [[TMP443]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT625:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT624]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP444:%.*]] = fmul contract <1 x double> [[BLOCK623]], [[SPLAT_SPLAT625]]
+; CHECK-NEXT: [[BLOCK626:%.*]] = shufflevector <3 x double> [[COL_LOAD575]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP445:%.*]] = extractelement <4 x double> [[COL_LOAD582]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT627:%.*]] = insertelement <1 x double> poison, double [[TMP445]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT628:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT627]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP446:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK626]], <1 x double> [[SPLAT_SPLAT628]], <1 x double> [[TMP444]])
+; CHECK-NEXT: [[BLOCK629:%.*]] = shufflevector <3 x double> [[COL_LOAD577]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP447:%.*]] = extractelement <4 x double> [[COL_LOAD582]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT630:%.*]] = insertelement <1 x double> poison, double [[TMP447]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT631:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT630]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP448:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK629]], <1 x double> [[SPLAT_SPLAT631]], <1 x double> [[TMP446]])
+; CHECK-NEXT: [[BLOCK632:%.*]] = shufflevector <3 x double> [[COL_LOAD579]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP449:%.*]] = extractelement <4 x double> [[COL_LOAD582]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT633:%.*]] = insertelement <1 x double> poison, double [[TMP449]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT634:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT633]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP450:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK632]], <1 x double> [[SPLAT_SPLAT634]], <1 x double> [[TMP448]])
+; CHECK-NEXT: [[TMP451:%.*]] = shufflevector <1 x double> [[TMP450]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP452:%.*]] = shufflevector <3 x double> [[TMP442]], <3 x double> [[TMP451]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK635:%.*]] = shufflevector <3 x double> [[COL_LOAD573]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP453:%.*]] = extractelement <4 x double> [[COL_LOAD584]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT636:%.*]] = insertelement <2 x double> poison, double [[TMP453]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT637:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT636]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP454:%.*]] = fmul contract <2 x double> [[BLOCK635]], [[SPLAT_SPLAT637]]
+; CHECK-NEXT: [[BLOCK638:%.*]] = shufflevector <3 x double> [[COL_LOAD575]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP455:%.*]] = extractelement <4 x double> [[COL_LOAD584]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT639:%.*]] = insertelement <2 x double> poison, double [[TMP455]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT640:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT639]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP456:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK638]], <2 x double> [[SPLAT_SPLAT640]], <2 x double> [[TMP454]])
+; CHECK-NEXT: [[BLOCK641:%.*]] = shufflevector <3 x double> [[COL_LOAD577]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP457:%.*]] = extractelement <4 x double> [[COL_LOAD584]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT642:%.*]] = insertelement <2 x double> poison, double [[TMP457]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT643:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT642]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP458:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK641]], <2 x double> [[SPLAT_SPLAT643]], <2 x double> [[TMP456]])
+; CHECK-NEXT: [[BLOCK644:%.*]] = shufflevector <3 x double> [[COL_LOAD579]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP459:%.*]] = extractelement <4 x double> [[COL_LOAD584]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT645:%.*]] = insertelement <2 x double> poison, double [[TMP459]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT646:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT645]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP460:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK644]], <2 x double> [[SPLAT_SPLAT646]], <2 x double> [[TMP458]])
+; CHECK-NEXT: [[TMP461:%.*]] = shufflevector <2 x double> [[TMP460]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP462:%.*]] = shufflevector <3 x double> zeroinitializer, <3 x double> [[TMP461]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK647:%.*]] = shufflevector <3 x double> [[TMP462]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK648:%.*]] = shufflevector <3 x double> [[COL_LOAD573]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP463:%.*]] = extractelement <4 x double> [[COL_LOAD584]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT649:%.*]] = insertelement <1 x double> poison, double [[TMP463]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT650:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT649]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP464:%.*]] = fmul contract <1 x double> [[BLOCK648]], [[SPLAT_SPLAT650]]
+; CHECK-NEXT: [[BLOCK651:%.*]] = shufflevector <3 x double> [[COL_LOAD575]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP465:%.*]] = extractelement <4 x double> [[COL_LOAD584]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT652:%.*]] = insertelement <1 x double> poison, double [[TMP465]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT653:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT652]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP466:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK651]], <1 x double> [[SPLAT_SPLAT653]], <1 x double> [[TMP464]])
+; CHECK-NEXT: [[BLOCK654:%.*]] = shufflevector <3 x double> [[COL_LOAD577]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP467:%.*]] = extractelement <4 x double> [[COL_LOAD584]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT655:%.*]] = insertelement <1 x double> poison, double [[TMP467]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT656:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT655]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP468:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK654]], <1 x double> [[SPLAT_SPLAT656]], <1 x double> [[TMP466]])
+; CHECK-NEXT: [[BLOCK657:%.*]] = shufflevector <3 x double> [[COL_LOAD579]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP469:%.*]] = extractelement <4 x double> [[COL_LOAD584]], i64 3
+; CHECK-NEXT: [[SPLAT_SPLATINSERT658:%.*]] = insertelement <1 x double> poison, double [[TMP469]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT659:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT658]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP470:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK657]], <1 x double> [[SPLAT_SPLAT659]], <1 x double> [[TMP468]])
+; CHECK-NEXT: [[TMP471:%.*]] = shufflevector <1 x double> [[TMP470]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP472:%.*]] = shufflevector <3 x double> [[TMP462]], <3 x double> [[TMP471]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[TMP473:%.*]] = getelementptr double, ptr [[A]], i64 32
+; CHECK-NEXT: [[COL_LOAD660:%.*]] = load <3 x double>, ptr [[TMP473]], align 8
+; CHECK-NEXT: [[VEC_GEP661:%.*]] = getelementptr double, ptr [[TMP473]], i64 7
+; CHECK-NEXT: [[COL_LOAD662:%.*]] = load <3 x double>, ptr [[VEC_GEP661]], align 8
+; CHECK-NEXT: [[VEC_GEP663:%.*]] = getelementptr double, ptr [[TMP473]], i64 14
+; CHECK-NEXT: [[COL_LOAD664:%.*]] = load <3 x double>, ptr [[VEC_GEP663]], align 8
+; CHECK-NEXT: [[TMP474:%.*]] = getelementptr double, ptr [[B]], i64 32
+; CHECK-NEXT: [[COL_LOAD665:%.*]] = load <3 x double>, ptr [[TMP474]], align 8
+; CHECK-NEXT: [[VEC_GEP666:%.*]] = getelementptr double, ptr [[TMP474]], i64 7
+; CHECK-NEXT: [[COL_LOAD667:%.*]] = load <3 x double>, ptr [[VEC_GEP666]], align 8
+; CHECK-NEXT: [[VEC_GEP668:%.*]] = getelementptr double, ptr [[TMP474]], i64 14
+; CHECK-NEXT: [[COL_LOAD669:%.*]] = load <3 x double>, ptr [[VEC_GEP668]], align 8
+; CHECK-NEXT: [[BLOCK670:%.*]] = shufflevector <3 x double> [[TMP432]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK671:%.*]] = shufflevector <3 x double> [[COL_LOAD660]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP475:%.*]] = extractelement <3 x double> [[COL_LOAD665]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT672:%.*]] = insertelement <2 x double> poison, double [[TMP475]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT673:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT672]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP476:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK671]], <2 x double> [[SPLAT_SPLAT673]], <2 x double> [[BLOCK670]])
+; CHECK-NEXT: [[BLOCK674:%.*]] = shufflevector <3 x double> [[COL_LOAD662]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP477:%.*]] = extractelement <3 x double> [[COL_LOAD665]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT675:%.*]] = insertelement <2 x double> poison, double [[TMP477]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT676:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT675]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP478:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK674]], <2 x double> [[SPLAT_SPLAT676]], <2 x double> [[TMP476]])
+; CHECK-NEXT: [[BLOCK677:%.*]] = shufflevector <3 x double> [[COL_LOAD664]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP479:%.*]] = extractelement <3 x double> [[COL_LOAD665]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT678:%.*]] = insertelement <2 x double> poison, double [[TMP479]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT679:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT678]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP480:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK677]], <2 x double> [[SPLAT_SPLAT679]], <2 x double> [[TMP478]])
+; CHECK-NEXT: [[TMP481:%.*]] = shufflevector <2 x double> [[TMP480]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP482:%.*]] = shufflevector <3 x double> [[TMP432]], <3 x double> [[TMP481]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK680:%.*]] = shufflevector <3 x double> [[TMP482]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK681:%.*]] = shufflevector <3 x double> [[COL_LOAD660]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP483:%.*]] = extractelement <3 x double> [[COL_LOAD665]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT682:%.*]] = insertelement <1 x double> poison, double [[TMP483]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT683:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT682]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP484:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK681]], <1 x double> [[SPLAT_SPLAT683]], <1 x double> [[BLOCK680]])
+; CHECK-NEXT: [[BLOCK684:%.*]] = shufflevector <3 x double> [[COL_LOAD662]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP485:%.*]] = extractelement <3 x double> [[COL_LOAD665]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT685:%.*]] = insertelement <1 x double> poison, double [[TMP485]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT686:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT685]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP486:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK684]], <1 x double> [[SPLAT_SPLAT686]], <1 x double> [[TMP484]])
+; CHECK-NEXT: [[BLOCK687:%.*]] = shufflevector <3 x double> [[COL_LOAD664]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP487:%.*]] = extractelement <3 x double> [[COL_LOAD665]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT688:%.*]] = insertelement <1 x double> poison, double [[TMP487]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT689:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT688]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP488:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK687]], <1 x double> [[SPLAT_SPLAT689]], <1 x double> [[TMP486]])
+; CHECK-NEXT: [[TMP489:%.*]] = shufflevector <1 x double> [[TMP488]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP490:%.*]] = shufflevector <3 x double> [[TMP482]], <3 x double> [[TMP489]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK690:%.*]] = shufflevector <3 x double> [[TMP452]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK691:%.*]] = shufflevector <3 x double> [[COL_LOAD660]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP491:%.*]] = extractelement <3 x double> [[COL_LOAD667]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT692:%.*]] = insertelement <2 x double> poison, double [[TMP491]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT693:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT692]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP492:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK691]], <2 x double> [[SPLAT_SPLAT693]], <2 x double> [[BLOCK690]])
+; CHECK-NEXT: [[BLOCK694:%.*]] = shufflevector <3 x double> [[COL_LOAD662]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP493:%.*]] = extractelement <3 x double> [[COL_LOAD667]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT695:%.*]] = insertelement <2 x double> poison, double [[TMP493]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT696:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT695]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP494:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK694]], <2 x double> [[SPLAT_SPLAT696]], <2 x double> [[TMP492]])
+; CHECK-NEXT: [[BLOCK697:%.*]] = shufflevector <3 x double> [[COL_LOAD664]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP495:%.*]] = extractelement <3 x double> [[COL_LOAD667]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT698:%.*]] = insertelement <2 x double> poison, double [[TMP495]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT699:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT698]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP496:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK697]], <2 x double> [[SPLAT_SPLAT699]], <2 x double> [[TMP494]])
+; CHECK-NEXT: [[TMP497:%.*]] = shufflevector <2 x double> [[TMP496]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP498:%.*]] = shufflevector <3 x double> [[TMP452]], <3 x double> [[TMP497]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK700:%.*]] = shufflevector <3 x double> [[TMP498]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK701:%.*]] = shufflevector <3 x double> [[COL_LOAD660]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP499:%.*]] = extractelement <3 x double> [[COL_LOAD667]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT702:%.*]] = insertelement <1 x double> poison, double [[TMP499]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT703:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT702]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP500:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK701]], <1 x double> [[SPLAT_SPLAT703]], <1 x double> [[BLOCK700]])
+; CHECK-NEXT: [[BLOCK704:%.*]] = shufflevector <3 x double> [[COL_LOAD662]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP501:%.*]] = extractelement <3 x double> [[COL_LOAD667]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT705:%.*]] = insertelement <1 x double> poison, double [[TMP501]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT706:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT705]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP502:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK704]], <1 x double> [[SPLAT_SPLAT706]], <1 x double> [[TMP500]])
+; CHECK-NEXT: [[BLOCK707:%.*]] = shufflevector <3 x double> [[COL_LOAD664]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP503:%.*]] = extractelement <3 x double> [[COL_LOAD667]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT708:%.*]] = insertelement <1 x double> poison, double [[TMP503]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT709:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT708]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP504:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK707]], <1 x double> [[SPLAT_SPLAT709]], <1 x double> [[TMP502]])
+; CHECK-NEXT: [[TMP505:%.*]] = shufflevector <1 x double> [[TMP504]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP506:%.*]] = shufflevector <3 x double> [[TMP498]], <3 x double> [[TMP505]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[BLOCK710:%.*]] = shufflevector <3 x double> [[TMP472]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[BLOCK711:%.*]] = shufflevector <3 x double> [[COL_LOAD660]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP507:%.*]] = extractelement <3 x double> [[COL_LOAD669]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT712:%.*]] = insertelement <2 x double> poison, double [[TMP507]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT713:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT712]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP508:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK711]], <2 x double> [[SPLAT_SPLAT713]], <2 x double> [[BLOCK710]])
+; CHECK-NEXT: [[BLOCK714:%.*]] = shufflevector <3 x double> [[COL_LOAD662]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP509:%.*]] = extractelement <3 x double> [[COL_LOAD669]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT715:%.*]] = insertelement <2 x double> poison, double [[TMP509]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT716:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT715]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP510:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK714]], <2 x double> [[SPLAT_SPLAT716]], <2 x double> [[TMP508]])
+; CHECK-NEXT: [[BLOCK717:%.*]] = shufflevector <3 x double> [[COL_LOAD664]], <3 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP511:%.*]] = extractelement <3 x double> [[COL_LOAD669]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT718:%.*]] = insertelement <2 x double> poison, double [[TMP511]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT719:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT718]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP512:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK717]], <2 x double> [[SPLAT_SPLAT719]], <2 x double> [[TMP510]])
+; CHECK-NEXT: [[TMP513:%.*]] = shufflevector <2 x double> [[TMP512]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP514:%.*]] = shufflevector <3 x double> [[TMP472]], <3 x double> [[TMP513]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[BLOCK720:%.*]] = shufflevector <3 x double> [[TMP514]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[BLOCK721:%.*]] = shufflevector <3 x double> [[COL_LOAD660]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP515:%.*]] = extractelement <3 x double> [[COL_LOAD669]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLATINSERT722:%.*]] = insertelement <1 x double> poison, double [[TMP515]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT723:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT722]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP516:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK721]], <1 x double> [[SPLAT_SPLAT723]], <1 x double> [[BLOCK720]])
+; CHECK-NEXT: [[BLOCK724:%.*]] = shufflevector <3 x double> [[COL_LOAD662]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP517:%.*]] = extractelement <3 x double> [[COL_LOAD669]], i64 1
+; CHECK-NEXT: [[SPLAT_SPLATINSERT725:%.*]] = insertelement <1 x double> poison, double [[TMP517]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT726:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT725]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP518:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK724]], <1 x double> [[SPLAT_SPLAT726]], <1 x double> [[TMP516]])
+; CHECK-NEXT: [[BLOCK727:%.*]] = shufflevector <3 x double> [[COL_LOAD664]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT: [[TMP519:%.*]] = extractelement <3 x double> [[COL_LOAD669]], i64 2
+; CHECK-NEXT: [[SPLAT_SPLATINSERT728:%.*]] = insertelement <1 x double> poison, double [[TMP519]], i64 0
+; CHECK-NEXT: [[SPLAT_SPLAT729:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT728]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP520:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK727]], <1 x double> [[SPLAT_SPLAT729]], <1 x double> [[TMP518]])
+; CHECK-NEXT: [[TMP521:%.*]] = shufflevector <1 x double> [[TMP520]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP522:%.*]] = shufflevector <3 x double> [[TMP514]], <3 x double> [[TMP521]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT: [[TMP523:%.*]] = getelementptr double, ptr [[C]], i64 32
+; CHECK-NEXT: store <3 x double> [[TMP490]], ptr [[TMP523]], align 8
+; CHECK-NEXT: [[VEC_GEP730:%.*]] = getelementptr double, ptr [[TMP523]], i64 7
+; CHECK-NEXT: store <3 x double> [[TMP506]], ptr [[VEC_GEP730]], align 8
+; CHECK-NEXT: [[VEC_GEP731:%.*]] = getelementptr double, ptr [[TMP523]], i64 14
+; CHECK-NEXT: store <3 x double> [[TMP522]], ptr [[VEC_GEP731]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %a = load <49 x double>, ptr %A, align 8
+ %b = load <49 x double>, ptr %B, align 8
+ %c = call <49 x double> @llvm.matrix.multiply.v49f64.v49f64.v49f64(<49 x double> %a, <49 x double> %b, i32 7, i32 7, i32 7)
+ store <49 x double> %c, ptr %C, align 8
+ ret void
+}
More information about the llvm-commits
mailing list