[llvm] [msan] Handle Arm NEON BFloat16 multiply-add to single-precision (PR #178510)
Thurston Dang via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 28 12:47:34 PST 2026
https://github.com/thurstond created https://github.com/llvm/llvm-project/pull/178510
aarch64.neon.bfmlalb/t perform dot-products after zeroing out the odd/even-indexed values. We handle these by generalizing handleVectorDotProductIntrinsic() and (mis-)using getPclmulMask().
>From 31d6ca25c18cbe5b91d3811c5ed857df5a518a05 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 28 Jan 2026 20:38:59 +0000
Subject: [PATCH] [msan] Handle Arm NEON BFloat16 multiply-add to
single-precision
aarch64.neon.bfmlalb/t perform dot-products after zeroing out the
odd/even-indexed values. We handle these by generalizing
handleVectorDotProductIntrinsic() and (mis-)using getPclmulMask().
---
.../Instrumentation/MemorySanitizer.cpp | 49 +++++-
.../aarch64-bf16-dotprod-intrinsics.ll | 162 ++++++++----------
2 files changed, 120 insertions(+), 91 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 8371e7a009d18..0c07859c8949c 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3961,7 +3961,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleVectorDotProductIntrinsic(IntrinsicInst &I,
unsigned ReductionFactor,
bool ZeroPurifies,
- unsigned EltSizeInBits = 0) {
+ unsigned EltSizeInBits = 0,
+ bool UseEvenLanes = true,
+ bool UseOddLanes = true) {
IRBuilder<> IRB(&I);
[[maybe_unused]] FixedVectorType *ReturnType =
@@ -3974,6 +3976,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Sa = nullptr;
Value *Sb = nullptr;
+ assert(UseEvenLanes || UseOddLanes);
+
assert(I.arg_size() == 2 || I.arg_size() == 3);
if (I.arg_size() == 2) {
Va = I.getOperand(0);
@@ -3981,6 +3985,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Sa = getShadow(&I, 0);
Sb = getShadow(&I, 1);
+
+ assert(UseEvenLanes && UseOddLanes);
} else if (I.arg_size() == 3) {
// Operand 0 is the accumulator. We will deal with that below.
Va = I.getOperand(1);
@@ -3988,6 +3994,26 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Sa = getShadow(&I, 1);
Sb = getShadow(&I, 2);
+
+ if (UseEvenLanes && UseOddLanes) {
+ // Default
+ } else if (UseEvenLanes || UseOddLanes) {
+ // Convert < S0, S1, S2, S3, S4, S5, S6, S7 >
+ // to < S0, S0, S2, S2, S4, S4, S6, S6 > (if even)
+ // to < S1, S1, S3, S3, S5, S5, S7, S7 > (if odd)
+ //
+ // Note: for aarch64.neon.bfmlalb/t, the odd/even-indexed values are
+ // zeroed, not duplicated. However, for shadow propagation, this
+ // distinction is unimportant because Step 1 below will squeeze
+ // each pair of elements (e.g., [S0, S0]) into a single bit, and
+ // we only care if it is fully initialized.
+
+ FixedVectorType* InputShadowType = cast<FixedVectorType>(Sa->getType());
+ unsigned Width = InputShadowType->getNumElements();
+
+ Sa = IRB.CreateShuffleVector(Sa, getPclmulMask(Width, /*OddElements=*/UseOddLanes));
+ Sb = IRB.CreateShuffleVector(Sb, getPclmulMask(Width, /*OddElements=*/UseOddLanes));
+ }
}
FixedVectorType *ParamType = cast<FixedVectorType>(Va->getType());
@@ -5942,6 +5968,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/*EltSizeInBits=*/16);
break;
+ // BFloat16 multiply-add to single-precision
+ // <4 x float> llvm.aarch64.neon.bfmlalt
+ // (<4 x float>, <8 x bfloat>, <8 x bfloat>)
+ case Intrinsic::aarch64_neon_bfmlalt:
+ handleVectorDotProductIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/false,
+ /*EltSizeInBits=*/0,
+ /*UseEvenLanes=*/false,
+ /*UseOddLanes=*/true);
+ break;
+
+ // <4 x float> llvm.aarch64.neon.bfmlalb
+ // (<4 x float>, <8 x bfloat>, <8 x bfloat>)
+ case Intrinsic::aarch64_neon_bfmlalb:
+ handleVectorDotProductIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/false,
+ /*EltSizeInBits=*/0,
+ /*UseEvenLanes=*/true,
+ /*UseOddLanes=*/false);
+ break;
+
// AVX Vector Neural Network Instructions: bytes
//
// Multiply and Add Signed Bytes
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll
index ad55c896fff72..c24947a16c8b7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll
@@ -233,25 +233,22 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
; CHECK-LABEL: define <4 x float> @test_vbfmlalbq_f32(
; CHECK-SAME: <4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB7]]:
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext <8 x i1> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
; CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
;
entry:
@@ -263,25 +260,22 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
; CHECK-LABEL: define <4 x float> @test_vbfmlaltq_f32(
; CHECK-SAME: <4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB7]]:
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext <8 x i1> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
; CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
;
entry:
@@ -294,26 +288,23 @@ define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
; CHECK-SAME: <4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> splat (i16 -1), <8 x i32> zeroinitializer
; CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B]], <4 x bfloat> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB7]]:
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[_MSPROP]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext <8 x i1> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
; CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
;
entry:
@@ -327,26 +318,23 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
; CHECK-SAME: <4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> splat (i16 -1), <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B]], <8 x bfloat> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB7]]:
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[_MSPROP]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext <8 x i1> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
; CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
;
entry:
@@ -360,26 +348,23 @@ define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
; CHECK-SAME: <4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> splat (i16 -1), <8 x i32> zeroinitializer
; CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B]], <4 x bfloat> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB7]]:
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[_MSPROP]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext <8 x i1> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
; CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
;
entry:
@@ -393,26 +378,23 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
; CHECK-SAME: <4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> splat (i16 -1), <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B]], <8 x bfloat> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB7]]:
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[_MSPROP]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sext <8 x i1> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
; CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
;
entry:
More information about the llvm-commits
mailing list