[llvm] [msan] Add test for Arm NEON vmul (PR #117935)
Thurston Dang via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 27 14:46:26 PST 2024
https://github.com/thurstond created https://github.com/llvm/llvm-project/pull/117935
None
>From d94eed4f88781accca53ba8aa251cd0079fbd9e3 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 27 Nov 2024 22:45:26 +0000
Subject: [PATCH] [msan] Add test for Arm NEON vmul
---
.../MemorySanitizer/AArch64/neon_vmul.ll | 3834 +++++++++++++++++
1 file changed, 3834 insertions(+)
create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vmul.ll
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vmul.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vmul.ll
new file mode 100644
index 00000000000000..4861d9aafc17f1
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vmul.ll
@@ -0,0 +1,3834 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Test memory sanitizer instrumentation for Arm vector multiplication
+; instructions.
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-vmul.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <8 x i16> @smull8h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %temp1 = load <8 x i8>, ptr %A
+ %temp2 = load <8 x i8>, ptr %B
+ %temp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %temp1, <8 x i8> %temp2)
+ ret <8 x i16> %temp3
+}
+
+define <4 x i32> @smull4s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @smull4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ ret <4 x i32> %temp3
+}
+
+define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @smull2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ ret <2 x i64> %temp3
+}
+
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <8 x i16> @umull8h(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <8 x i16> @umull8h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %temp1 = load <8 x i8>, ptr %A
+ %temp2 = load <8 x i8>, ptr %B
+ %temp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %temp1, <8 x i8> %temp2)
+ ret <8 x i16> %temp3
+}
+
+define <4 x i32> @umull4s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @umull4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ ret <4 x i32> %temp3
+}
+
+define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @umull2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ ret <2 x i64> %temp3
+}
+
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <4 x i32> @sqdmull4s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmull4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ ret <4 x i32> %temp3
+}
+
+define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmull2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ ret <2 x i64> %temp3
+}
+
+define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmull2_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LOAD1:%.*]] = load <8 x i16>, ptr [[A]], align 16
+; CHECK-NEXT: [[LOAD2:%.*]] = load <8 x i16>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[LOAD1]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %load1 = load <8 x i16>, ptr %A
+ %load2 = load <8 x i16>, ptr %B
+ %temp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ ret <4 x i32> %temp3
+}
+
+define <2 x i64> @sqdmull2_2d(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmull2_2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[LOAD2:%.*]] = load <4 x i32>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD1]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[LOAD2]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
+;
+ %load1 = load <4 x i32>, ptr %A
+ %load2 = load <4 x i32>, ptr %B
+ %temp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ ret <2 x i64> %temp3
+}
+
+
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <8 x i16> @pmull8h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %temp1 = load <8 x i8>, ptr %A
+ %temp2 = load <8 x i8>, ptr %B
+ %temp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %temp1, <8 x i8> %temp2)
+ ret <8 x i16> %temp3
+}
+
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+
+define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x i16> @sqdmulh_4h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i16> [[TMP3]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %temp1, <4 x i16> %temp2)
+ ret <4 x i16> %temp3
+}
+
+define <8 x i16> @sqdmulh_8h(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <8 x i16> @sqdmulh_8h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %temp1 = load <8 x i16>, ptr %A
+ %temp2 = load <8 x i16>, ptr %B
+ %temp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %temp1, <8 x i16> %temp2)
+ ret <8 x i16> %temp3
+}
+
+define <2 x i32> @sqdmulh_2s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <2 x i32> @sqdmulh_2s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[TMP3]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %temp1, <2 x i32> %temp2)
+ ret <2 x i32> %temp3
+}
+
+define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmulh_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %temp1 = load <4 x i32>, ptr %A
+ %temp2 = load <4 x i32>, ptr %B
+ %temp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %temp1, <4 x i32> %temp2)
+ ret <4 x i32> %temp3
+}
+
+define i32 @sqdmulh_1s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define i32 @sqdmulh_1s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[TMP1]], i32 [[TMP2]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %temp1 = load i32, ptr %A
+ %temp2 = load i32, ptr %B
+ %temp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %temp1, i32 %temp2)
+ ret i32 %temp3
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
+
+define <4 x i16> @sqrdmulh_4h(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x i16> @sqrdmulh_4h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i16> [[TMP3]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %temp1, <4 x i16> %temp2)
+ ret <4 x i16> %temp3
+}
+
+define <8 x i16> @sqrdmulh_8h(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <8 x i16> @sqrdmulh_8h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %temp1 = load <8 x i16>, ptr %A
+ %temp2 = load <8 x i16>, ptr %B
+ %temp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %temp1, <8 x i16> %temp2)
+ ret <8 x i16> %temp3
+}
+
+define <2 x i32> @sqrdmulh_2s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <2 x i32> @sqrdmulh_2s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[TMP3]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %temp1, <2 x i32> %temp2)
+ ret <2 x i32> %temp3
+}
+
+define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqrdmulh_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %temp1 = load <4 x i32>, ptr %A
+ %temp2 = load <4 x i32>, ptr %B
+ %temp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %temp1, <4 x i32> %temp2)
+ ret <4 x i32> %temp3
+}
+
+define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define i32 @sqrdmulh_1s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[TMP1]], i32 [[TMP2]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %temp1 = load i32, ptr %A
+ %temp2 = load i32, ptr %B
+ %temp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %temp1, i32 %temp2)
+ ret i32 %temp3
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
+
+define <2 x float> @fmulx_2s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <2 x float> @fmulx_2s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[TMP3]]
+;
+ %temp1 = load <2 x float>, ptr %A
+ %temp2 = load <2 x float>, ptr %B
+ %temp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %temp1, <2 x float> %temp2)
+ ret <2 x float> %temp3
+}
+
+define <4 x float> @fmulx_4s(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <4 x float> @fmulx_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %temp1 = load <4 x float>, ptr %A
+ %temp2 = load <4 x float>, ptr %B
+ %temp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %temp1, <4 x float> %temp2)
+ ret <4 x float> %temp3
+}
+
+define <2 x double> @fmulx_2d(ptr %A, ptr %B) nounwind {
+; CHECK-LABEL: define <2 x double> @fmulx_2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP3]]
+;
+ %temp1 = load <2 x double>, ptr %A
+ %temp2 = load <2 x double>, ptr %B
+ %temp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %temp1, <2 x double> %temp2)
+ ret <2 x double> %temp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @smlal4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = load <4 x i32>, ptr %C
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp5 = add <4 x i32> %temp3, %temp4
+ ret <4 x i32> %temp5
+}
+
+define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @smlal2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = load <2 x i64>, ptr %C
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp5 = add <2 x i64> %temp3, %temp4
+ ret <2 x i64> %temp5
+}
+
+define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: define void @smlal8h_chain_with_constant(
+; CHECK-SAME: ptr [[DST:%.*]], <8 x i8> [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[XOR:%.*]] = xor <8 x i8> [[V3]], splat (i8 -1)
+; CHECK-NEXT: [[SMULL_1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[V1]], <8 x i8> [[V3]])
+; CHECK-NEXT: [[ADD_1:%.*]] = add <8 x i16> [[SMULL_1]], splat (i16 257)
+; CHECK-NEXT: [[SMULL_2:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[V2]], <8 x i8> [[XOR]])
+; CHECK-NEXT: [[ADD_2:%.*]] = add <8 x i16> [[ADD_1]], [[SMULL_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[TMP3]], align 16
+; CHECK-NEXT: store <8 x i16> [[ADD_2]], ptr [[DST]], align 16
+; CHECK-NEXT: ret void
+;
+ %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
+ %add.1 = add <8 x i16> %smull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
+ %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
+ %add.2 = add <8 x i16> %add.1, %smull.2
+ store <8 x i16> %add.2, ptr %dst
+ ret void
+}
+
+define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+; CHECK-LABEL: define void @smlal2d_chain_with_constant(
+; CHECK-SAME: ptr [[DST:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> [[V2:%.*]], <2 x i32> [[V3:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i32> [[V3]], splat (i32 -1)
+; CHECK-NEXT: [[SMULL_1:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[V1]], <2 x i32> [[V3]])
+; CHECK-NEXT: [[ADD_1:%.*]] = add <2 x i64> [[SMULL_1]], splat (i64 257)
+; CHECK-NEXT: [[SMULL_2:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[V2]], <2 x i32> [[XOR]])
+; CHECK-NEXT: [[ADD_2:%.*]] = add <2 x i64> [[ADD_1]], [[SMULL_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
+; CHECK-NEXT: store <2 x i64> [[ADD_2]], ptr [[DST]], align 16
+; CHECK-NEXT: ret void
+;
+ %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
+ %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
+ %add.1 = add <2 x i64> %smull.1, <i64 257, i64 257>
+ %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
+ %add.2 = add <2 x i64> %add.1, %smull.2
+ store <2 x i64> %add.2, ptr %dst
+ ret void
+}
+
+define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @smlsl4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = load <4 x i32>, ptr %C
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp5 = sub <4 x i32> %temp3, %temp4
+ ret <4 x i32> %temp5
+}
+
+define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @smlsl2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = load <2 x i64>, ptr %C
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp5 = sub <2 x i64> %temp3, %temp4
+ ret <2 x i64> %temp5
+}
+
+define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: define void @smlsl8h_chain_with_constant(
+; CHECK-SAME: ptr [[DST:%.*]], <8 x i8> [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[XOR:%.*]] = xor <8 x i8> [[V3]], splat (i8 -1)
+; CHECK-NEXT: [[SMULL_1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[V1]], <8 x i8> [[V3]])
+; CHECK-NEXT: [[SUB_1:%.*]] = sub <8 x i16> splat (i16 257), [[SMULL_1]]
+; CHECK-NEXT: [[SMULL_2:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[V2]], <8 x i8> [[XOR]])
+; CHECK-NEXT: [[SUB_2:%.*]] = sub <8 x i16> [[SUB_1]], [[SMULL_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[TMP3]], align 16
+; CHECK-NEXT: store <8 x i16> [[SUB_2]], ptr [[DST]], align 16
+; CHECK-NEXT: ret void
+;
+ %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
+ %sub.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %smull.1
+ %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
+ %sub.2 = sub <8 x i16> %sub.1, %smull.2
+ store <8 x i16> %sub.2, ptr %dst
+ ret void
+}
+
+define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+; CHECK-LABEL: define void @smlsl2d_chain_with_constant(
+; CHECK-SAME: ptr [[DST:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> [[V2:%.*]], <2 x i32> [[V3:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i32> [[V3]], splat (i32 -1)
+; CHECK-NEXT: [[SMULL_1:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[V1]], <2 x i32> [[V3]])
+; CHECK-NEXT: [[SUB_1:%.*]] = sub <2 x i64> splat (i64 257), [[SMULL_1]]
+; CHECK-NEXT: [[SMULL_2:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[V2]], <2 x i32> [[XOR]])
+; CHECK-NEXT: [[SUB_2:%.*]] = sub <2 x i64> [[SUB_1]], [[SMULL_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
+; CHECK-NEXT: store <2 x i64> [[SUB_2]], ptr [[DST]], align 16
+; CHECK-NEXT: ret void
+;
+ %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
+ %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
+ %sub.1 = sub <2 x i64> <i64 257, i64 257>, %smull.1
+ %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
+ %sub.2 = sub <2 x i64> %sub.1, %smull.2
+ store <2 x i64> %sub.2, ptr %dst
+ ret void
+}
+
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
+
+define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmlal4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = load <4 x i32>, ptr %C
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %temp3, <4 x i32> %temp4)
+ ret <4 x i32> %temp5
+}
+
+define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmlal2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = load <2 x i64>, ptr %C
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %temp3, <2 x i64> %temp4)
+ ret <2 x i64> %temp5
+}
+
+define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmlal2_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LOAD1:%.*]] = load <8 x i16>, ptr [[A]], align 16
+; CHECK-NEXT: [[LOAD2:%.*]] = load <8 x i16>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[LOAD1]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %load1 = load <8 x i16>, ptr %A
+ %load2 = load <8 x i16>, ptr %B
+ %temp3 = load <4 x i32>, ptr %C
+ %temp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %temp3, <4 x i32> %temp4)
+ ret <4 x i32> %temp5
+}
+
+define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmlal2_2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[LOAD2:%.*]] = load <4 x i32>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD1]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[LOAD2]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %load1 = load <4 x i32>, ptr %A
+ %load2 = load <4 x i32>, ptr %B
+ %temp3 = load <2 x i64>, ptr %C
+ %temp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %temp3, <2 x i64> %temp4)
+ ret <2 x i64> %temp5
+}
+
+define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmlsl4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = load <4 x i32>, ptr %C
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %temp3, <4 x i32> %temp4)
+ ret <4 x i32> %temp5
+}
+
+define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmlsl2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = load <2 x i64>, ptr %C
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %temp3, <2 x i64> %temp4)
+ ret <2 x i64> %temp5
+}
+
+define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmlsl2_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LOAD1:%.*]] = load <8 x i16>, ptr [[A]], align 16
+; CHECK-NEXT: [[LOAD2:%.*]] = load <8 x i16>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[LOAD1]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %load1 = load <8 x i16>, ptr %A
+ %load2 = load <8 x i16>, ptr %B
+ %temp3 = load <4 x i32>, ptr %C
+ %temp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %temp3, <4 x i32> %temp4)
+ ret <4 x i32> %temp5
+}
+
+define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmlsl2_2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[LOAD2:%.*]] = load <4 x i32>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD1]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[LOAD2]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %load1 = load <4 x i32>, ptr %A
+ %load2 = load <4 x i32>, ptr %B
+ %temp3 = load <2 x i64>, ptr %C
+ %temp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %temp3, <2 x i64> %temp4)
+ ret <2 x i64> %temp5
+}
+
+define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @umlal4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = load <4 x i32>, ptr %C
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp5 = add <4 x i32> %temp3, %temp4
+ ret <4 x i32> %temp5
+}
+
+define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @umlal2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = load <2 x i64>, ptr %C
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp5 = add <2 x i64> %temp3, %temp4
+ ret <2 x i64> %temp5
+}
+
+define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: define void @umlal8h_chain_with_constant(
+; CHECK-SAME: ptr [[DST:%.*]], <8 x i8> [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[XOR:%.*]] = xor <8 x i8> [[V3]], splat (i8 -1)
+; CHECK-NEXT: [[UMULL_1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[V1]], <8 x i8> [[V3]])
+; CHECK-NEXT: [[ADD_1:%.*]] = add <8 x i16> [[UMULL_1]], splat (i16 257)
+; CHECK-NEXT: [[UMULL_2:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[V2]], <8 x i8> [[XOR]])
+; CHECK-NEXT: [[ADD_2:%.*]] = add <8 x i16> [[ADD_1]], [[UMULL_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[TMP3]], align 16
+; CHECK-NEXT: store <8 x i16> [[ADD_2]], ptr [[DST]], align 16
+; CHECK-NEXT: ret void
+;
+ %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
+ %add.1 = add <8 x i16> %umull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
+ %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
+ %add.2 = add <8 x i16> %add.1, %umull.2
+ store <8 x i16> %add.2, ptr %dst
+ ret void
+}
+
+define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+; CHECK-LABEL: define void @umlal2d_chain_with_constant(
+; CHECK-SAME: ptr [[DST:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> [[V2:%.*]], <2 x i32> [[V3:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i32> [[V3]], splat (i32 -1)
+; CHECK-NEXT: [[UMULL_1:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[V1]], <2 x i32> [[V3]])
+; CHECK-NEXT: [[ADD_1:%.*]] = add <2 x i64> [[UMULL_1]], splat (i64 257)
+; CHECK-NEXT: [[UMULL_2:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[V2]], <2 x i32> [[XOR]])
+; CHECK-NEXT: [[ADD_2:%.*]] = add <2 x i64> [[ADD_1]], [[UMULL_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
+; CHECK-NEXT: store <2 x i64> [[ADD_2]], ptr [[DST]], align 16
+; CHECK-NEXT: ret void
+;
+ %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
+ %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
+ %add.1 = add <2 x i64> %umull.1, <i64 257, i64 257>
+ %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
+ %add.2 = add <2 x i64> %add.1, %umull.2
+ store <2 x i64> %add.2, ptr %dst
+ ret void
+}
+
+define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @umlsl4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %temp3 = load <4 x i32>, ptr %C
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp5 = sub <4 x i32> %temp3, %temp4
+ ret <4 x i32> %temp5
+}
+
+define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @umlsl2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %temp3 = load <2 x i64>, ptr %C
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp5 = sub <2 x i64> %temp3, %temp4
+ ret <2 x i64> %temp5
+}
+
+define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: define void @umlsl8h_chain_with_constant(
+; CHECK-SAME: ptr [[DST:%.*]], <8 x i8> [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[XOR:%.*]] = xor <8 x i8> [[V3]], splat (i8 -1)
+; CHECK-NEXT: [[UMULL_1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[V1]], <8 x i8> [[V3]])
+; CHECK-NEXT: [[ADD_1:%.*]] = sub <8 x i16> splat (i16 257), [[UMULL_1]]
+; CHECK-NEXT: [[UMULL_2:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[V2]], <8 x i8> [[XOR]])
+; CHECK-NEXT: [[ADD_2:%.*]] = sub <8 x i16> [[ADD_1]], [[UMULL_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[TMP3]], align 16
+; CHECK-NEXT: store <8 x i16> [[ADD_2]], ptr [[DST]], align 16
+; CHECK-NEXT: ret void
+;
+ %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
+ %add.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %umull.1
+ %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
+ %add.2 = sub <8 x i16> %add.1, %umull.2
+ store <8 x i16> %add.2, ptr %dst
+ ret void
+}
+
+define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+; CHECK-LABEL: define void @umlsl2d_chain_with_constant(
+; CHECK-SAME: ptr [[DST:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> [[V2:%.*]], <2 x i32> [[V3:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i32> [[V3]], splat (i32 -1)
+; CHECK-NEXT: [[UMULL_1:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[V1]], <2 x i32> [[V3]])
+; CHECK-NEXT: [[ADD_1:%.*]] = sub <2 x i64> splat (i64 257), [[UMULL_1]]
+; CHECK-NEXT: [[UMULL_2:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[V2]], <2 x i32> [[XOR]])
+; CHECK-NEXT: [[ADD_2:%.*]] = sub <2 x i64> [[ADD_1]], [[UMULL_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
+; CHECK-NEXT: store <2 x i64> [[ADD_2]], ptr [[DST]], align 16
+; CHECK-NEXT: ret void
+;
+ %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
+ %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
+ %add.1 = sub <2 x i64> <i64 257, i64 257>, %umull.1
+ %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
+ %add.2 = sub <2 x i64> %add.1, %umull.2
+ store <2 x i64> %add.2, ptr %dst
+ ret void
+}
+
+define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x float> @fmla_2s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[C]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[TMP4]]
+;
+ %temp1 = load <2 x float>, ptr %A
+ %temp2 = load <2 x float>, ptr %B
+ %temp3 = load <2 x float>, ptr %C
+ %temp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %temp1, <2 x float> %temp2, <2 x float> %temp3)
+ ret <2 x float> %temp4
+}
+
+define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x float> @fmla_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %temp1 = load <4 x float>, ptr %A
+ %temp2 = load <4 x float>, ptr %B
+ %temp3 = load <4 x float>, ptr %C
+ %temp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %temp1, <4 x float> %temp2, <4 x float> %temp3)
+ ret <4 x float> %temp4
+}
+
+define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x double> @fmla_2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP4]]
+;
+ %temp1 = load <2 x double>, ptr %A
+ %temp2 = load <2 x double>, ptr %B
+ %temp3 = load <2 x double>, ptr %C
+ %temp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %temp1, <2 x double> %temp2, <2 x double> %temp3)
+ ret <2 x double> %temp4
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x float> @fmls_2s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[C]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[TMP5]]
+;
+ %temp1 = load <2 x float>, ptr %A
+ %temp2 = load <2 x float>, ptr %B
+ %temp3 = load <2 x float>, ptr %C
+ %temp4 = fsub <2 x float> <float -0.0, float -0.0>, %temp2
+ %temp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %temp1, <2 x float> %temp4, <2 x float> %temp3)
+ ret <2 x float> %temp5
+}
+
+define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x float> @fmls_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP5]]
+;
+ %temp1 = load <4 x float>, ptr %A
+ %temp2 = load <4 x float>, ptr %B
+ %temp3 = load <4 x float>, ptr %C
+ %temp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %temp2
+ %temp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %temp1, <4 x float> %temp4, <4 x float> %temp3)
+ ret <4 x float> %temp5
+}
+
+define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x double> @fmls_2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP5]]
+;
+ %temp1 = load <2 x double>, ptr %A
+ %temp2 = load <2 x double>, ptr %B
+ %temp3 = load <2 x double>, ptr %C
+ %temp4 = fsub <2 x double> <double -0.0, double -0.0>, %temp2
+ %temp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %temp1, <2 x double> %temp4, <2 x double> %temp3)
+ ret <2 x double> %temp5
+}
+
+define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x float> @fmls_commuted_neg_2s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[C]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP1]], <2 x float> [[TMP3]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[TMP5]]
+;
+ %temp1 = load <2 x float>, ptr %A
+ %temp2 = load <2 x float>, ptr %B
+ %temp3 = load <2 x float>, ptr %C
+ %temp4 = fsub <2 x float> <float -0.0, float -0.0>, %temp2
+ %temp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %temp4, <2 x float> %temp1, <2 x float> %temp3)
+ ret <2 x float> %temp5
+}
+
+define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <4 x float> @fmls_commuted_neg_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x float> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP5]]
+;
+ %temp1 = load <4 x float>, ptr %A
+ %temp2 = load <4 x float>, ptr %B
+ %temp3 = load <4 x float>, ptr %C
+ %temp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %temp2
+ %temp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %temp4, <4 x float> %temp1, <4 x float> %temp3)
+ ret <4 x float> %temp5
+}
+
+define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-LABEL: define <2 x double> @fmls_commuted_neg_2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[C]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP1]], <2 x double> [[TMP3]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP5]]
+;
+ %temp1 = load <2 x double>, ptr %A
+ %temp2 = load <2 x double>, ptr %B
+ %temp3 = load <2 x double>, ptr %C
+ %temp4 = fsub <2 x double> <double -0.0, double -0.0>, %temp2
+ %temp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %temp4, <2 x double> %temp1, <2 x double> %temp3)
+ ret <2 x double> %temp5
+}
+
+define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x float> @fmls_indexed_2s(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[B]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[FMLS1:%.*]] = tail call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP0]], <2 x float> [[LANE]], <2 x float> [[A]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[FMLS1]]
+;
+entry:
+ %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
+ %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
+ %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
+ ret <2 x float> %fmls1
+}
+
+define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x float> @fmls_indexed_4s(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[FMLS1:%.*]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP0]], <4 x float> [[LANE]], <4 x float> [[A]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[FMLS1]]
+;
+entry:
+ %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
+ ret <4 x float> %fmls1
+}
+
+define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x double> @fmls_indexed_2d(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[C]]
+; CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[B]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[FMLS1:%.*]] = tail call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP0]], <2 x double> [[LANE]], <2 x double> [[A]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[FMLS1]]
+;
+entry:
+ %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
+ %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+ %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
+ ret <2 x double> %fmls1
+}
+
+define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x float> @fmla_indexed_scalar_2s(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], float [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[V1:%.*]] = insertelement <2 x float> undef, float [[C]], i32 0
+; CHECK-NEXT: [[V2:%.*]] = insertelement <2 x float> [[V1]], float [[C]], i32 1
+; CHECK-NEXT: [[FMLA1:%.*]] = tail call <2 x float> @llvm.fma.v2f32(<2 x float> [[V1]], <2 x float> [[B]], <2 x float> [[A]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[FMLA1]]
+;
+entry:
+ %v1 = insertelement <2 x float> undef, float %c, i32 0
+ %v2 = insertelement <2 x float> %v1, float %c, i32 1
+ %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind
+ ret <2 x float> %fmla1
+}
+
+define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x float> @fmla_indexed_scalar_4s(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], float [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> undef, float [[C]], i32 0
+; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[C]], i32 1
+; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[C]], i32 2
+; CHECK-NEXT: [[V4:%.*]] = insertelement <4 x float> [[V3]], float [[C]], i32 3
+; CHECK-NEXT: [[FMLA1:%.*]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4]], <4 x float> [[B]], <4 x float> [[A]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[FMLA1]]
+;
+entry:
+ %v1 = insertelement <4 x float> undef, float %c, i32 0
+ %v2 = insertelement <4 x float> %v1, float %c, i32 1
+ %v3 = insertelement <4 x float> %v2, float %c, i32 2
+ %v4 = insertelement <4 x float> %v3, float %c, i32 3
+ %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %fmla1
+}
+
+define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x double> @fmla_indexed_scalar_2d(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], double [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[V1:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0
+; CHECK-NEXT: [[V2:%.*]] = insertelement <2 x double> [[V1]], double [[C]], i32 1
+; CHECK-NEXT: [[FMLA1:%.*]] = tail call <2 x double> @llvm.fma.v2f64(<2 x double> [[V2]], <2 x double> [[B]], <2 x double> [[A]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[FMLA1]]
+;
+entry:
+ %v1 = insertelement <2 x double> undef, double %c, i32 0
+ %v2 = insertelement <2 x double> %v1, double %c, i32 1
+ %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %fmla1
+}
+
+define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp strictfp {
+; CHECK-LABEL: define <2 x float> @fmls_indexed_2s_strict(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = fneg <2 x float> [[C]]
+; CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[B]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[FMLS1:%.*]] = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[TMP0]], <2 x float> [[LANE]], <2 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[FMLS1]]
+;
+entry:
+ %0 = fneg <2 x float> %c
+ %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
+ %fmls1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+ ret <2 x float> %fmls1
+}
+
+define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp strictfp {
+; CHECK-LABEL: define <4 x float> @fmls_indexed_4s_strict(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = fneg <4 x float> [[C]]
+; CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[FMLS1:%.*]] = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[TMP0]], <4 x float> [[LANE]], <4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR7]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[FMLS1]]
+;
+entry:
+ %0 = fneg <4 x float> %c
+ %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ %fmls1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+ ret <4 x float> %fmls1
+}
+
+define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp strictfp {
+; CHECK-LABEL: define <2 x double> @fmls_indexed_2d_strict(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = fneg <2 x double> [[C]]
+; CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[B]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[FMLS1:%.*]] = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[TMP0]], <2 x double> [[LANE]], <2 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR7]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[FMLS1]]
+;
+entry:
+ %0 = fneg <2 x double> %c
+ %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+ %fmls1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+ ret <2 x double> %fmls1
+}
+
+define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp strictfp {
+; CHECK-LABEL: define <2 x float> @fmla_indexed_scalar_2s_strict(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], float [[C:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[V1:%.*]] = insertelement <2 x float> undef, float [[C]], i32 0
+; CHECK-NEXT: [[V2:%.*]] = insertelement <2 x float> [[V1]], float [[C]], i32 1
+; CHECK-NEXT: [[FMLA1:%.*]] = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[V2]], <2 x float> [[B]], <2 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR7]]
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[FMLA1]]
+;
+entry:
+ %v1 = insertelement <2 x float> undef, float %c, i32 0
+ %v2 = insertelement <2 x float> %v1, float %c, i32 1
+ %fmla1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+ ret <2 x float> %fmla1
+}
+
+define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp strictfp {
+; CHECK-LABEL: define <4 x float> @fmla_indexed_scalar_4s_strict(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], float [[C:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> undef, float [[C]], i32 0
+; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[C]], i32 1
+; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[C]], i32 2
+; CHECK-NEXT: [[V4:%.*]] = insertelement <4 x float> [[V3]], float [[C]], i32 3
+; CHECK-NEXT: [[FMLA1:%.*]] = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[V4]], <4 x float> [[B]], <4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR7]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[FMLA1]]
+;
+entry:
+ %v1 = insertelement <4 x float> undef, float %c, i32 0
+ %v2 = insertelement <4 x float> %v1, float %c, i32 1
+ %v3 = insertelement <4 x float> %v2, float %c, i32 2
+ %v4 = insertelement <4 x float> %v3, float %c, i32 3
+ %fmla1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+ ret <4 x float> %fmla1
+}
+
+define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp strictfp {
+; CHECK-LABEL: define <2 x double> @fmla_indexed_scalar_2d_strict(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], double [[C:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[V1:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0
+; CHECK-NEXT: [[V2:%.*]] = insertelement <2 x double> [[V1]], double [[C]], i32 1
+; CHECK-NEXT: [[FMLA1:%.*]] = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[V2]], <2 x double> [[B]], <2 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR7]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[FMLA1]]
+;
+entry:
+ %v1 = insertelement <2 x double> undef, double %c, i32 0
+ %v2 = insertelement <2 x double> %v1, double %c, i32 1
+ %fmla1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+ ret <2 x double> %fmla1
+}
+
+attributes #0 = { strictfp }
+
+declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata)
+declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
+
+define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
+; CHECK-LABEL: define <4 x i16> @mul_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[A]], [[TMP3]]
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i16> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = mul <4 x i16> %A, %temp3
+ ret <4 x i16> %temp4
+}
+
+define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK-LABEL: define <8 x i16> @mul_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i16> [[A]], [[TMP3]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP4]]
+;
+ %temp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %temp4 = mul <8 x i16> %A, %temp3
+ ret <8 x i16> %temp4
+}
+
+define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
+; CHECK-LABEL: define <2 x i32> @mul_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[A]], [[TMP3]]
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = mul <2 x i32> %A, %temp3
+ ret <2 x i32> %temp4
+}
+
+define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @mul_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[A]], [[TMP3]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = mul <4 x i32> %A, %temp3
+ ret <4 x i32> %temp4
+}
+
+define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @mul_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i64> [[A]], [[B]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP1]]
+;
+ %temp1 = mul <2 x i64> %A, %B
+ ret <2 x i64> %temp1
+}
+
+define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
+; CHECK-LABEL: define <2 x float> @fmul_lane_2s(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[B]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[A]], [[TMP3]]
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = fmul <2 x float> %A, %temp3
+ ret <2 x float> %temp4
+}
+
+define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
+; CHECK-LABEL: define <4 x float> @fmul_lane_4s(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[A]], [[TMP3]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = fmul <4 x float> %A, %temp3
+ ret <4 x float> %temp4
+}
+
+define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
+; CHECK-LABEL: define <2 x double> @fmul_lane_2d(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[A]], [[TMP3]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = fmul <2 x double> %A, %temp3
+ ret <2 x double> %temp4
+}
+
+define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
+; CHECK-LABEL: define float @fmul_lane_s(
+; CHECK-SAME: float [[A:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[B:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT: [[RES:%.*]] = fmul float [[A]], [[B]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret float [[RES]]
+;
+ %B = extractelement <4 x float> %vec, i32 3
+ %res = fmul float %A, %B
+ ret float %res
+}
+
+define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
+; CHECK-LABEL: define double @fmul_lane_d(
+; CHECK-SAME: double [[A:%.*]], <2 x double> [[VEC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[B:%.*]] = extractelement <2 x double> [[VEC]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = fmul double [[A]], [[B]]
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret double [[RES]]
+;
+ %B = extractelement <2 x double> %vec, i32 1
+ %res = fmul double %A, %B
+ ret double %res
+}
+
+
+
+define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
+; CHECK-LABEL: define <2 x float> @fmulx_lane_2s(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[B]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[TMP3]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %temp3)
+ ret <2 x float> %temp4
+}
+
+define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
+; CHECK-LABEL: define <4 x float> @fmulx_lane_4s(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %temp3)
+ ret <4 x float> %temp4
+}
+
+define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
+; CHECK-LABEL: define <2 x double> @fmulx_lane_2d(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[TMP3]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %temp3)
+ ret <2 x double> %temp4
+}
+
+define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
+; CHECK-LABEL: define <4 x i16> @sqdmulh_lane_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[TMP3]])
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i16> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %temp3)
+ ret <4 x i16> %temp4
+}
+
+define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK-LABEL: define <8 x i16> @sqdmulh_lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[TMP3]])
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP4]]
+;
+ %temp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %temp3)
+ ret <8 x i16> %temp4
+}
+
+define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
+; CHECK-LABEL: define <2 x i32> @sqdmulh_lane_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[TMP3]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %temp3)
+ ret <2 x i32> %temp4
+}
+
+define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmulh_lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %temp3)
+ ret <4 x i32> %temp4
+}
+
+define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: define i32 @sqdmulh_lane_1s(
+; CHECK-SAME: i32 [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[B]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[TMP1]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %temp1 = extractelement <4 x i32> %B, i32 1
+ %temp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %temp1)
+ ret i32 %temp2
+}
+
+define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
+; CHECK-LABEL: define <4 x i16> @sqrdmulh_lane_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[TMP3]])
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i16> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %temp3)
+ ret <4 x i16> %temp4
+}
+
+define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK-LABEL: define <8 x i16> @sqrdmulh_lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[TMP3]])
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP4]]
+;
+ %temp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %temp3)
+ ret <8 x i16> %temp4
+}
+
+define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
+; CHECK-LABEL: define <2 x i32> @sqrdmulh_lane_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[TMP3]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %temp3)
+ ret <2 x i32> %temp4
+}
+
+define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqrdmulh_lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %temp3)
+ ret <4 x i32> %temp4
+}
+
+define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: define i32 @sqrdmulh_lane_1s(
+; CHECK-SAME: i32 [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[B]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[TMP1]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %temp1 = extractelement <4 x i32> %B, i32 1
+ %temp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %temp1)
+ ret i32 %temp2
+}
+
+define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmull_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %temp3)
+ ret <4 x i32> %temp4
+}
+
+define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmull_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP3]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %temp3)
+ ret <2 x i64> %temp4
+}
+
+define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmull2_lane_4s(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %temp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ ret <4 x i32> %temp4
+}
+
+define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmull2_lane_2d(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP4]]
+;
+ %temp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ ret <2 x i64> %temp4
+}
+
+define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @umull_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %temp3)
+ ret <4 x i32> %temp4
+}
+
+define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @umull_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP3]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %temp3)
+ ret <2 x i64> %temp4
+}
+
+define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
+; CHECK-LABEL: define <4 x i32> @smull_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP3]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %temp3)
+ ret <4 x i32> %temp4
+}
+
+define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
+; CHECK-LABEL: define <2 x i64> @smull_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP3]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP4]]
+;
+ %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %temp3)
+ ret <2 x i64> %temp4
+}
+
+define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @smlal_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[C]], [[TMP5]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP6]]
+;
+ %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
+ %temp6 = add <4 x i32> %C, %temp5
+ ret <4 x i32> %temp6
+}
+
+define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @smlal_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[C]], [[TMP5]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP6]]
+;
+ %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
+ %temp6 = add <2 x i64> %C, %temp5
+ ret <2 x i64> %temp6
+}
+
+define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmlal_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[C]], <4 x i32> [[TMP5]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP6]]
+;
+ %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
+ %temp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %temp5)
+ ret <4 x i32> %temp6
+}
+
+define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmlal_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[C]], <2 x i64> [[TMP5]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP6]]
+;
+ %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
+ %temp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %temp5)
+ ret <2 x i64> %temp6
+}
+
+define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmlal2_lane_4s(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[C]], <4 x i32> [[TMP5]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP6]]
+;
+ %temp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %temp5)
+ ret <4 x i32> %temp6
+}
+
+define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmlal2_lane_2d(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[C]], <2 x i64> [[TMP5]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP6]]
+;
+ %temp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %temp5)
+ ret <2 x i64> %temp6
+}
+
+define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
+; CHECK-LABEL: define i32 @sqdmlal_lane_1s(
+; CHECK-SAME: i32 [[A:%.*]], i16 [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS:%.*]] = insertelement <4 x i16> undef, i16 [[B]], i32 0
+; CHECK-NEXT: [[RHS:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[PROD_VEC:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[LHS]], <4 x i16> [[RHS]])
+; CHECK-NEXT: [[PROD:%.*]] = extractelement <4 x i32> [[PROD_VEC]], i32 0
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[PROD]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
+ %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+ %prod = extractelement <4 x i32> %prod.vec, i32 0
+ %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
+
+define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
+; CHECK-LABEL: define i32 @sqdmlsl_lane_1s(
+; CHECK-SAME: i32 [[A:%.*]], i16 [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS:%.*]] = insertelement <4 x i16> undef, i16 [[B]], i32 0
+; CHECK-NEXT: [[RHS:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[PROD_VEC:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[LHS]], <4 x i16> [[RHS]])
+; CHECK-NEXT: [[PROD:%.*]] = extractelement <4 x i32> [[PROD_VEC]], i32 0
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[PROD]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
+ %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+ %prod = extractelement <4 x i32> %prod.vec, i32 0
+ %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
+
+define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
+; CHECK-LABEL: define i32 @sqadd_lane1_sqdmull4s(
+; CHECK-SAME: i32 [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[PROD_VEC:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+; CHECK-NEXT: [[PROD:%.*]] = extractelement <4 x i32> [[PROD_VEC]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[PROD]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
+ %prod = extractelement <4 x i32> %prod.vec, i32 1
+ %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+
+define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
+; CHECK-LABEL: define i32 @sqsub_lane1_sqdmull4s(
+; CHECK-SAME: i32 [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[PROD_VEC:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+; CHECK-NEXT: [[PROD:%.*]] = extractelement <4 x i32> [[PROD_VEC]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[PROD]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
+ %prod = extractelement <4 x i32> %prod.vec, i32 1
+ %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+
+define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
+; CHECK-LABEL: define i64 @sqdmlal_lane_1d(
+; CHECK-SAME: i64 [[A:%.*]], i32 [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS:%.*]] = extractelement <2 x i32> [[C]], i32 1
+; CHECK-NEXT: [[PROD:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[RHS]])
+; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[PROD]])
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i64 [[RES]]
+;
+ %rhs = extractelement <2 x i32> %C, i32 1
+ %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+ %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
+ ret i64 %res
+}
+declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
+declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
+
+define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
+; CHECK-LABEL: define i64 @sqdmlsl_lane_1d(
+; CHECK-SAME: i64 [[A:%.*]], i32 [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS:%.*]] = extractelement <2 x i32> [[C]], i32 1
+; CHECK-NEXT: [[PROD:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[RHS]])
+; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[PROD]])
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i64 [[RES]]
+;
+ %rhs = extractelement <2 x i32> %C, i32 1
+ %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+ %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
+ ret i64 %res
+}
+declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
+
+
+define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @umlal_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[C]], [[TMP5]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP6]]
+;
+ %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
+ %temp6 = add <4 x i32> %C, %temp5
+ ret <4 x i32> %temp6
+}
+
+define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @umlal_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[C]], [[TMP5]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP6]]
+;
+ %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
+ %temp6 = add <2 x i64> %C, %temp5
+ ret <2 x i64> %temp6
+}
+
+
+define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @smlsl_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[C]], [[TMP5]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP6]]
+;
+ %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
+ %temp6 = sub <4 x i32> %C, %temp5
+ ret <4 x i32> %temp6
+}
+
+define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @smlsl_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[C]], [[TMP5]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP6]]
+;
+ %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
+ %temp6 = sub <2 x i64> %C, %temp5
+ ret <2 x i64> %temp6
+}
+
+define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmlsl_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[C]], <4 x i32> [[TMP5]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP6]]
+;
+ %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
+ %temp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %temp5)
+ ret <4 x i32> %temp6
+}
+
+define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmlsl_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[C]], <2 x i64> [[TMP5]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP6]]
+;
+ %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
+ %temp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %temp5)
+ ret <2 x i64> %temp6
+}
+
+define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @sqdmlsl2_lane_4s(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[C]], <4 x i32> [[TMP5]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP6]]
+;
+ %temp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %temp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %temp5)
+ ret <4 x i32> %temp6
+}
+
+define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @sqdmlsl2_lane_2d(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[C]], <2 x i64> [[TMP5]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP6]]
+;
+ %temp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %temp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
+ %temp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %temp5)
+ ret <2 x i64> %temp6
+}
+
+define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
+; CHECK-LABEL: define <4 x i32> @umlsl_lane_4s(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[C]], [[TMP5]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP6]]
+;
+ %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
+ %temp6 = sub <4 x i32> %C, %temp5
+ ret <4 x i32> %temp6
+}
+
+define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
+; CHECK-LABEL: define <2 x i64> @umlsl_lane_2d(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[C]], [[TMP5]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TMP6]]
+;
+ %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %temp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
+ %temp6 = sub <2 x i64> %C, %temp5
+ ret <2 x i64> %temp6
+}
+
+; Scalar FMULX
+define float @fmulxs(float %a, float %b) nounwind {
+; CHECK-LABEL: define float @fmulxs(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[FMULX_I:%.*]] = tail call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[B]]) #[[ATTR0]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret float [[FMULX_I]]
+;
+ %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
+ ret float %fmulx.i
+}
+
+define double @fmulxd(double %a, double %b) nounwind {
+; CHECK-LABEL: define double @fmulxd(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[FMULX_I:%.*]] = tail call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[B]]) #[[ATTR0]]
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret double [[FMULX_I]]
+;
+ %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
+ ret double %fmulx.i
+}
+
+define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
+; CHECK-LABEL: define float @fmulxs_lane(
+; CHECK-SAME: float [[A:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[B:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT: [[FMULX_I:%.*]] = tail call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[B]]) #[[ATTR0]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret float [[FMULX_I]]
+;
+ %b = extractelement <4 x float> %vec, i32 3
+ %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
+ ret float %fmulx.i
+}
+
+define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
+; CHECK-LABEL: define double @fmulxd_lane(
+; CHECK-SAME: double [[A:%.*]], <2 x double> [[VEC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[B:%.*]] = extractelement <2 x double> [[VEC]], i32 1
+; CHECK-NEXT: [[FMULX_I:%.*]] = tail call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[B]]) #[[ATTR0]]
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret double [[FMULX_I]]
+;
+ %b = extractelement <2 x double> %vec, i32 1
+ %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
+ ret double %fmulx.i
+}
+
+declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
+
+
+define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: define <8 x i16> @smull2_8h_simple(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: define <8 x i16> @foo0(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <8 x i8>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <8 x i8>
+; CHECK-NEXT: [[VMULL_I_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]]
+;
+ %temp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
+ %temp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %temp1, <8 x i8> %temp3) nounwind
+ ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: define <4 x i32> @foo1(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <4 x i16>
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+;
+ %temp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %temp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: define <2 x i64> @foo2(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <2 x i32>
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+;
+ %temp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %temp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: define <8 x i16> @foo3(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <8 x i8>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <8 x i8>
+; CHECK-NEXT: [[VMULL_I_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]]
+;
+ %temp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
+ %temp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %temp1, <8 x i8> %temp3) nounwind
+ ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: define <4 x i32> @foo4(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <4 x i16>
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+;
+ %temp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %temp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: define <2 x i64> @foo5(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <2 x i32>
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+;
+ %temp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %temp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <4 x i32> @foo6(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[SHUFFLE]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+;
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
+define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <4 x i32> @foo6a(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[SHUFFLE]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+;
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <2 x i64> @foo7(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <2 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[SHUFFLE]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+;
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <2 x i64> @foo7a(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <2 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[SHUFFLE]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+;
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+
+define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <4 x i32> @foo8(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[SHUFFLE]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+;
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
+define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <4 x i32> @foo8a(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[SHUFFLE]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+;
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <2 x i64> @foo9(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <2 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[SHUFFLE]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+;
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <2 x i64> @foo9a(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <2 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[SHUFFLE]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+;
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
+; CHECK-LABEL: define <8 x i16> @bar0(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <16 x i8> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <8 x i8>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <8 x i8>
+; CHECK-NEXT: [[VMULL_I_I_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMULL_I_I_I]], [[A]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+;
+ %temp = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %temp2 = bitcast <16 x i8> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %temp1, <8 x i8> %temp3) nounwind
+ %add.i = add <8 x i16> %vmull.i.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
+; CHECK-LABEL: define <4 x i32> @bar1(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <4 x i16>
+; CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMULL2_I_I_I]], [[A]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+;
+ %temp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %temp2 = bitcast <8 x i16> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
+ %add.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
+; CHECK-LABEL: define <2 x i64> @bar2(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <2 x i32>
+; CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMULL2_I_I_I]], [[A]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+;
+ %temp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %temp2 = bitcast <4 x i32> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
+ %add.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
+; CHECK-LABEL: define <8 x i16> @bar3(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <16 x i8> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <8 x i8>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <8 x i8>
+; CHECK-NEXT: [[VMULL_I_I_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMULL_I_I_I]], [[A]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+;
+ %temp = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %temp2 = bitcast <16 x i8> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %temp1, <8 x i8> %temp3) nounwind
+ %add.i = add <8 x i16> %vmull.i.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
+; CHECK-LABEL: define <4 x i32> @bar4(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <4 x i16>
+; CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMULL2_I_I_I]], [[A]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+;
+ %temp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %temp2 = bitcast <8 x i16> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
+ %add.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
+; CHECK-LABEL: define <2 x i64> @bar5(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <2 x i32>
+; CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMULL2_I_I_I]], [[A]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+;
+ %temp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %temp2 = bitcast <4 x i32> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
+ %add.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
+; CHECK-LABEL: define <4 x i32> @mlal2_1(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <4 x i16>
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[VMULL2_I_I]], [[A]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[ADD]]
+;
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %temp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %temp2 = bitcast <8 x i16> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
+ %add = add <4 x i32> %vmull2.i.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
+; CHECK-LABEL: define <2 x i64> @mlal2_2(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <2 x i32>
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[VMULL2_I_I]], [[A]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[ADD]]
+;
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %temp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %temp2 = bitcast <4 x i32> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
+ %add = add <2 x i64> %vmull2.i.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
+; CHECK-LABEL: define <4 x i32> @mlal2_4(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <4 x i16>
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[VMULL2_I_I]], [[A]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[ADD]]
+;
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %temp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %temp2 = bitcast <8 x i16> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
+ %add = add <4 x i32> %vmull2.i.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
+; CHECK-LABEL: define <2 x i64> @mlal2_5(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <2 x i32>
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR0]]
+; CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[VMULL2_I_I]], [[A]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[ADD]]
+;
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
+ %temp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %temp2 = bitcast <4 x i32> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %temp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
+ %add = add <2 x i64> %vmull2.i.i, %a
+ ret <2 x i64> %add
+}
+
+; rdar://12328502
+define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x double> @vmulq_n_f64(
+; CHECK-SAME: <2 x double> [[X:%.*]], double [[Y:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double [[Y]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[Y]], i32 1
+; CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[VECINIT1_I]], [[X]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[MUL_I]]
+;
+entry:
+ %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
+ %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
+ %mul.i = fmul <2 x double> %vecinit1.i, %x
+ ret <2 x double> %mul.i
+}
+
+define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x float> @vmulq_n_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Y]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Y]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[Y]], i32 2
+; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Y]], i32 3
+; CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[VECINIT3_I]], [[X]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[MUL_I]]
+;
+entry:
+ %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
+ %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
+ %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
+ %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
+ %mul.i = fmul <4 x float> %vecinit3.i, %x
+ ret <4 x float> %mul.i
+}
+
+define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x float> @vmul_n_f32(
+; CHECK-SAME: <2 x float> [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[Y]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[Y]], i32 1
+; CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[VECINIT1_I]], [[X]]
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[MUL_I]]
+;
+entry:
+ %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
+ %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
+ %mul.i = fmul <2 x float> %vecinit1.i, %x
+ ret <2 x float> %mul.i
+}
+
+define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i16> @vmla_laneq_s16_test(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[SHUFFLE]], [[B]]
+; CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[MUL]], [[A]]
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i16> [[ADD]]
+;
+entry:
+ %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %mul = mul <4 x i16> %shuffle, %b
+ %add = add <4 x i16> %mul, %a
+ ret <4 x i16> %add
+}
+
+define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x i32> @vmla_laneq_s32_test(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[SHUFFLE]], [[B]]
+; CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[MUL]], [[A]]
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[ADD]]
+;
+entry:
+ %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %mul = mul <2 x i32> %shuffle, %b
+ %add = add <2 x i32> %mul, %a
+ ret <2 x i32> %add
+}
+
+define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
+; CHECK-LABEL: define <8 x i16> @not_really_vmlaq_laneq_s16_test(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[SHUFFLE1]], <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[SHUFFLE2]], [[B]]
+; CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[MUL]], [[A]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[ADD]]
+;
+entry:
+ %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <8 x i16> %shuffle2, %b
+ %add = add <8 x i16> %mul, %a
+ ret <8 x i16> %add
+}
+
+define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @not_really_vmlaq_laneq_s32_test(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[SHUFFLE1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[SHUFFLE2]], [[B]]
+; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[MUL]], [[A]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[ADD]]
+;
+entry:
+ %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <4 x i32> %shuffle2, %b
+ %add = add <4 x i32> %mul, %a
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @vmull_laneq_s16_test(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+;
+entry:
+ %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x i64> @vmull_laneq_s32_test(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 2>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+;
+entry:
+ %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+ ret <2 x i64> %vmull2.i
+}
+define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @vmull_laneq_u16_test(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+;
+entry:
+ %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
+; CHECK-LABEL: define <2 x i64> @vmull_laneq_u32_test(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 2>
+; CHECK-NEXT: [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+;
+entry:
+ %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <4 x i32> @vmull_low_n_s16_test(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[D]] to i16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[CONV]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[CONV]], i32 2
+; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[CONV]], i32 3
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[VECINIT3_I]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+;
+entry:
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <4 x i32> @vmull_high_n_s16_test(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[D]] to i16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[CONV]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[CONV]], i32 2
+; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[CONV]], i32 3
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[VECINIT3_I]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+;
+entry:
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <2 x i64> @vmull_high_n_s32_test(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[D]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[D]], i32 1
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[VECINIT1_I]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+;
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <4 x i32> @vmull_high_n_u16_test(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[D]] to i16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[CONV]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[CONV]], i32 2
+; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[CONV]], i32 3
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[VECINIT3_I]]) #[[ATTR0]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+;
+entry:
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
+; CHECK-LABEL: define <2 x i64> @vmull_high_n_u32_test(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[D]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[D]], i32 1
+; CHECK-NEXT: [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[VECINIT1_I]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+;
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: define <4 x i32> @vmul_built_dup_test(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[B]], i32 1
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[VGET_LANE]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[VGET_LANE]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[VGET_LANE]], i32 2
+; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[VGET_LANE]], i32 3
+; CHECK-NEXT: [[PROD:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[PROD]]
+;
+ %vget_lane = extractelement <4 x i32> %b, i32 1
+ %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
+ %prod = mul <4 x i32> %a, %vecinit3.i
+ ret <4 x i32> %prod
+}
+
+define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: define <4 x i16> @vmul_built_dup_fromsmall_test(
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B]], i32 3
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[VGET_LANE]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[VGET_LANE]], i32 2
+; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[VGET_LANE]], i32 3
+; CHECK-NEXT: [[PROD:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i16> [[PROD]]
+;
+ %vget_lane = extractelement <4 x i16> %b, i32 3
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %prod = mul <4 x i16> %a, %vecinit3.i
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: define <8 x i16> @vmulq_built_dup_fromsmall_test(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[VGET_LANE]], i32 0
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[VGET_LANE]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[VGET_LANE]], i32 2
+; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[VGET_LANE]], i32 3
+; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[VGET_LANE]], i32 4
+; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[VGET_LANE]], i32 5
+; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[VGET_LANE]], i32 6
+; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[VGET_LANE]], i32 7
+; CHECK-NEXT: [[PROD:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[PROD]]
+;
+ %vget_lane = extractelement <4 x i16> %b, i32 0
+ %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
+ %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
+ %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
+ %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
+ %prod = mul <8 x i16> %a, %vecinit7.i
+ ret <8 x i16> %prod
+}
+
+define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: define <2 x i64> @mull_from_two_extracts(
+; CHECK-SAME: <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: define <2 x i64> @mlal_from_two_extracts(
+; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: [[SUM:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[ACCUM]], <2 x i64> [[RES]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[SUM]]
+;
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: define <2 x i64> @mull_from_extract_dup_low(
+; CHECK-SAME: <4 x i32> [[LHS:%.*]], i32 [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHSVEC_TMP:%.*]] = insertelement <2 x i32> undef, i32 [[RHS]], i32 0
+; CHECK-NEXT: [[RHSVEC:%.*]] = insertelement <2 x i32> [[RHSVEC_TMP]], i32 [[RHS]], i32 1
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHSVEC]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: define <2 x i64> @mull_from_extract_dup_high(
+; CHECK-SAME: <4 x i32> [[LHS:%.*]], i32 [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHSVEC_TMP:%.*]] = insertelement <2 x i32> undef, i32 [[RHS]], i32 0
+; CHECK-NEXT: [[RHSVEC:%.*]] = insertelement <2 x i32> [[RHSVEC_TMP]], i32 [[RHS]], i32 1
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHSVEC]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: define <8 x i16> @pmull_from_extract_dup_low(
+; CHECK-SAME: <16 x i8> [[LHS:%.*]], i8 [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHSVEC_0:%.*]] = insertelement <8 x i8> undef, i8 [[RHS]], i32 0
+; CHECK-NEXT: [[RHSVEC:%.*]] = shufflevector <8 x i8> [[RHSVEC_0]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <16 x i8> [[LHS]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[LHS_HIGH]], <8 x i8> [[RHSVEC]]) #[[ATTR0]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES]]
+;
+ %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
+ %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: define <8 x i16> @pmull_from_extract_dup_high(
+; CHECK-SAME: <16 x i8> [[LHS:%.*]], i8 [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHSVEC_0:%.*]] = insertelement <8 x i8> undef, i8 [[RHS]], i32 0
+; CHECK-NEXT: [[RHSVEC:%.*]] = shufflevector <8 x i8> [[RHSVEC_0]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <16 x i8> [[LHS]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[RES:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[LHS_HIGH]], <8 x i8> [[RHSVEC]]) #[[ATTR0]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES]]
+;
+ %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
+ %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: define <8 x i16> @pmull_from_extract_duplane_low(
+; CHECK-SAME: <16 x i8> [[LHS:%.*]], <8 x i8> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <16 x i8> [[LHS]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <8 x i8> [[RHS]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[LHS_HIGH]], <8 x i8> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES]]
+;
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: define <8 x i16> @pmull_from_extract_duplane_high(
+; CHECK-SAME: <16 x i8> [[LHS:%.*]], <8 x i8> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <16 x i8> [[LHS]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <8 x i8> [[RHS]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[LHS_HIGH]], <8 x i8> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES]]
+;
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+ ret <8 x i16> %res
+}
+
+define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: define <2 x i64> @sqdmull_from_extract_duplane_low(
+; CHECK-SAME: <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: define <2 x i64> @sqdmull_from_extract_duplane_high(
+; CHECK-SAME: <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: define <2 x i64> @sqdmlal_from_extract_duplane_low(
+; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: [[SUM:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[ACCUM]], <2 x i64> [[RES]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[SUM]]
+;
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: define <2 x i64> @sqdmlal_from_extract_duplane_high(
+; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: [[SUM:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[ACCUM]], <2 x i64> [[RES]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[SUM]]
+;
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: define <2 x i64> @umlal_from_extract_duplane_low(
+; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: [[SUM:%.*]] = add <2 x i64> [[ACCUM]], [[RES]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[SUM]]
+;
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = add <2 x i64> %accum, %res
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: define <2 x i64> @umlal_from_extract_duplane_high(
+; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR0]]
+; CHECK-NEXT: [[SUM:%.*]] = add <2 x i64> [[ACCUM]], [[RES]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[SUM]]
+;
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = add <2 x i64> %accum, %res
+ ret <2 x i64> %sum
+}
+
+define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
+; CHECK-LABEL: define float @scalar_fmla_from_extract_v4f32(
+; CHECK-SAME: float [[ACCUM:%.*]], float [[LHS:%.*]], <4 x float> [[RVEC:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS:%.*]] = extractelement <4 x float> [[RVEC]], i32 3
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.fma.f32(float [[LHS]], float [[RHS]], float [[ACCUM]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret float [[RES]]
+;
+ %rhs = extractelement <4 x float> %rvec, i32 3
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
+; CHECK-LABEL: define float @scalar_fmla_from_extract_v2f32(
+; CHECK-SAME: float [[ACCUM:%.*]], float [[LHS:%.*]], <2 x float> [[RVEC:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS:%.*]] = extractelement <2 x float> [[RVEC]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.fma.f32(float [[LHS]], float [[RHS]], float [[ACCUM]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret float [[RES]]
+;
+ %rhs = extractelement <2 x float> %rvec, i32 1
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
+; CHECK-LABEL: define float @scalar_fmls_from_extract_v4f32(
+; CHECK-SAME: float [[ACCUM:%.*]], float [[LHS:%.*]], <4 x float> [[RVEC:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS_SCAL:%.*]] = extractelement <4 x float> [[RVEC]], i32 3
+; CHECK-NEXT: [[RHS:%.*]] = fsub float -0.000000e+00, [[RHS_SCAL]]
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.fma.f32(float [[LHS]], float [[RHS]], float [[ACCUM]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret float [[RES]]
+;
+ %rhs.scal = extractelement <4 x float> %rvec, i32 3
+ %rhs = fsub float -0.0, %rhs.scal
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
+; CHECK-LABEL: define float @scalar_fmls_from_extract_v2f32(
+; CHECK-SAME: float [[ACCUM:%.*]], float [[LHS:%.*]], <2 x float> [[RVEC:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS_SCAL:%.*]] = extractelement <2 x float> [[RVEC]], i32 1
+; CHECK-NEXT: [[RHS:%.*]] = fsub float -0.000000e+00, [[RHS_SCAL]]
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.fma.f32(float [[LHS]], float [[RHS]], float [[ACCUM]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret float [[RES]]
+;
+ %rhs.scal = extractelement <2 x float> %rvec, i32 1
+ %rhs = fsub float -0.0, %rhs.scal
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
+; CHECK-LABEL: define double @scalar_fmla_from_extract_v2f64(
+; CHECK-SAME: double [[ACCUM:%.*]], double [[LHS:%.*]], <2 x double> [[RVEC:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS:%.*]] = extractelement <2 x double> [[RVEC]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = call double @llvm.fma.f64(double [[LHS]], double [[RHS]], double [[ACCUM]])
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret double [[RES]]
+;
+ %rhs = extractelement <2 x double> %rvec, i32 1
+ %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
+ ret double %res
+}
+
+define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
+; CHECK-LABEL: define double @scalar_fmls_from_extract_v2f64(
+; CHECK-SAME: double [[ACCUM:%.*]], double [[LHS:%.*]], <2 x double> [[RVEC:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS_SCAL:%.*]] = extractelement <2 x double> [[RVEC]], i32 1
+; CHECK-NEXT: [[RHS:%.*]] = fsub double -0.000000e+00, [[RHS_SCAL]]
+; CHECK-NEXT: [[RES:%.*]] = call double @llvm.fma.f64(double [[LHS]], double [[RHS]], double [[ACCUM]])
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret double [[RES]]
+;
+ %rhs.scal = extractelement <2 x double> %rvec, i32 1
+ %rhs = fsub double -0.0, %rhs.scal
+ %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
+ ret double %res
+}
+
+declare double @llvm.fma.f64(double, double, double)
+
+define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
+; CHECK-LABEL: define <2 x float> @fmls_with_fneg_before_extract_v2f32(
+; CHECK-SAME: <2 x float> [[ACCUM:%.*]], <2 x float> [[LHS:%.*]], <4 x float> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS_NEG:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[RHS]]
+; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x float> [[RHS_NEG]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[SPLAT]], <2 x float> [[ACCUM]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[RES]]
+;
+ %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+ %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
+ ret <2 x float> %res
+}
+
+define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
+; CHECK-LABEL: define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(
+; CHECK-SAME: <2 x float> [[ACCUM:%.*]], <2 x float> [[LHS:%.*]], <2 x float> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS_NEG:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[RHS]]
+; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <2 x float> [[RHS_NEG]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[SPLAT]], <2 x float> [[ACCUM]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x float> [[RES]]
+;
+ %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+ %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
+ ret <2 x float> %res
+}
+
+define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
+; CHECK-LABEL: define <4 x float> @fmls_with_fneg_before_extract_v4f32(
+; CHECK-SAME: <4 x float> [[ACCUM:%.*]], <4 x float> [[LHS:%.*]], <4 x float> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS_NEG:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[RHS]]
+; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x float> [[RHS_NEG]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[SPLAT]], <4 x float> [[ACCUM]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
+ ret <4 x float> %res
+}
+
+define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
+; CHECK-LABEL: define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(
+; CHECK-SAME: <4 x float> [[ACCUM:%.*]], <4 x float> [[LHS:%.*]], <2 x float> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS_NEG:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[RHS]]
+; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <2 x float> [[RHS_NEG]], <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[SPLAT]], <4 x float> [[ACCUM]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
+ ret <4 x float> %res
+}
+
+define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
+; CHECK-LABEL: define <2 x double> @fmls_with_fneg_before_extract_v2f64(
+; CHECK-SAME: <2 x double> [[ACCUM:%.*]], <2 x double> [[LHS:%.*]], <2 x double> [[RHS:%.*]]) {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[RHS_NEG:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[RHS]]
+; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <2 x double> [[RHS_NEG]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LHS]], <2 x double> [[SPLAT]], <2 x double> [[ACCUM]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
+ %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
+ ret <2 x double> %res
+}
+
+define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
+; CHECK-LABEL: define <1 x double> @test_fmul_v1f64(
+; CHECK-SAME: <1 x double> [[L:%.*]], <1 x double> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[PROD:%.*]] = fmul <1 x double> [[L]], [[R]]
+; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x double> [[PROD]]
+;
+ %prod = fmul <1 x double> %L, %R
+ ret <1 x double> %prod
+}
+
+define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
+; CHECK-LABEL: define <1 x double> @test_fdiv_v1f64(
+; CHECK-SAME: <1 x double> [[L:%.*]], <1 x double> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[PROD:%.*]] = fdiv <1 x double> [[L]], [[R]]
+; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x double> [[PROD]]
+;
+ %prod = fdiv <1 x double> %L, %R
+ ret <1 x double> %prod
+}
+
+define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
+; CHECK-LABEL: define i32 @sqdmlal_s(
+; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[A]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[B]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[C]], i32 [[TMP4]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+ %temp1 = insertelement <4 x i16> undef, i16 %A, i64 0
+ %temp2 = insertelement <4 x i16> undef, i16 %B, i64 0
+ %temp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp4 = extractelement <4 x i32> %temp3, i64 0
+ %temp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %temp4)
+ ret i32 %temp5
+}
+
+define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
+; CHECK-LABEL: define i64 @sqdmlal_d(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[C]], i64 [[TMP4]])
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+ %temp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+ %temp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %temp4)
+ ret i64 %temp5
+}
+
+define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
+; CHECK-LABEL: define i32 @sqdmlsl_s(
+; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[A]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[B]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[C]], i32 [[TMP4]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+ %temp1 = insertelement <4 x i16> undef, i16 %A, i64 0
+ %temp2 = insertelement <4 x i16> undef, i16 %B, i64 0
+ %temp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
+ %temp4 = extractelement <4 x i32> %temp3, i64 0
+ %temp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %temp4)
+ ret i32 %temp5
+}
+
+define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
+; CHECK-LABEL: define i64 @sqdmlsl_d(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[C]], i64 [[TMP4]])
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+ %temp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+ %temp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %temp4)
+ ret i64 %temp5
+}
+
+define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: define <16 x i8> @test_pmull_64(
+; CHECK-SAME: i64 [[L:%.*]], i64 [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[VAL:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[L]], i64 [[R]])
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[VAL]]
+;
+ %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
+ ret <16 x i8> %val
+}
+
+define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
+; CHECK-LABEL: define <16 x i8> @test_pmull_high_64(
+; CHECK-SAME: <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[L_HI:%.*]] = extractelement <2 x i64> [[L]], i32 1
+; CHECK-NEXT: [[R_HI:%.*]] = extractelement <2 x i64> [[R]], i32 1
+; CHECK-NEXT: [[VAL:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[L_HI]], i64 [[R_HI]])
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[VAL]]
+;
+ %l_hi = extractelement <2 x i64> %l, i32 1
+ %r_hi = extractelement <2 x i64> %r, i32 1
+ %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
+ ret <16 x i8> %val
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
+
+define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
+; CHECK-LABEL: define <1 x i64> @test_mul_v1i64(
+; CHECK-SAME: <1 x i64> [[LHS:%.*]], <1 x i64> [[RHS:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[PROD:%.*]] = mul <1 x i64> [[LHS]], [[RHS]]
+; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x i64> [[PROD]]
+;
+ %prod = mul <1 x i64> %lhs, %rhs
+ ret <1 x i64> %prod
+}
More information about the llvm-commits
mailing list