[clang] 59f063b - NeonEmitter: remove special 'a' type modifier.
Tim Northover via cfe-commits
cfe-commits at lists.llvm.org
Wed Nov 6 02:23:44 PST 2019
Author: Tim Northover
Date: 2019-11-06T10:23:36Z
New Revision: 59f063b89c518ae81467f6015d1c428c61583f71
URL: https://github.com/llvm/llvm-project/commit/59f063b89c518ae81467f6015d1c428c61583f71
DIFF: https://github.com/llvm/llvm-project/commit/59f063b89c518ae81467f6015d1c428c61583f71.diff
LOG: NeonEmitter: remove special 'a' type modifier.
The 'a' modifier used to implement a splat in C++ code in NeonEmitter.cpp, but
this can now be done directly from .td expansions (and most ops already did so).
Removing it simplifies the overall code.
https://reviews.llvm.org/D69716
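For reference, a minimal sketch of the semantics the new (dup ...) expansions
encode (e.g. OP_MULL_N above), written as ordinary arm_neon.h intrinsic calls.
The function name my_vmull_n_s16 is purely illustrative; the real vmull_n_s16
is produced by the generated header and is not necessarily spelled this way:

#include <arm_neon.h>

/* (call "vmull", $p0, (dup $p1)) says: duplicate the scalar operand into a
   vector, then call the ordinary vector intrinsic.  The same pattern applies
   to the other *_n ops switched over in this commit. */
static inline int32x4_t my_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_s16(a, vdup_n_s16(b)); /* splat b, then widening multiply */
}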
Added:
Modified:
clang/include/clang/Basic/arm_neon.td
clang/include/clang/Basic/arm_neon_incl.td
clang/test/CodeGen/aarch64-neon-2velem.c
clang/test/CodeGen/arm_neon_intrinsics.c
clang/utils/TableGen/NeonEmitter.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index a52ed496580d..127c5af97ce6 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -53,6 +53,7 @@ def OP_MLAL_N : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>;
def OP_MLSL_N : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>;
def OP_MUL_LN : Op<(op "*", $p0, (splat $p1, $p2))>;
def OP_MULX_LN : Op<(call "vmulx", $p0, (splat $p1, $p2))>;
+def OP_MULL_N : Op<(call "vmull", $p0, (dup $p1))>;
def OP_MULL_LN : Op<(call "vmull", $p0, (splat $p1, $p2))>;
def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (splat $p1, $p2))>;
def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (splat $p2, $p3)))>;
@@ -63,17 +64,22 @@ def OP_MLALHi_LN: Op<(op "+", $p0, (call "vmull", (call "vget_high", $p1),
def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (splat $p2, $p3)))>;
def OP_MLSLHi_LN : Op<(op "-", $p0, (call "vmull", (call "vget_high", $p1),
(splat $p2, $p3)))>;
+def OP_QDMULL_N : Op<(call "vqdmull", $p0, (dup $p1))>;
def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (splat $p1, $p2))>;
def OP_QDMULLHi_LN : Op<(call "vqdmull", (call "vget_high", $p0),
(splat $p1, $p2))>;
+def OP_QDMLAL_N : Op<(call "vqdmlal", $p0, $p1, (dup $p2))>;
def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (splat $p2, $p3))>;
def OP_QDMLALHi_LN : Op<(call "vqdmlal", $p0, (call "vget_high", $p1),
(splat $p2, $p3))>;
+def OP_QDMLSL_N : Op<(call "vqdmlsl", $p0, $p1, (dup $p2))>;
def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (splat $p2, $p3))>;
def OP_QDMLSLHi_LN : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1),
(splat $p2, $p3))>;
+def OP_QDMULH_N : Op<(call "vqdmulh", $p0, (dup $p1))>;
def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (splat $p1, $p2))>;
def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (splat $p1, $p2))>;
+def OP_QRDMULH_N : Op<(call "vqrdmulh", $p0, (dup $p1))>;
def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>;
def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>;
def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
@@ -516,13 +522,13 @@ def VQDMLSL_LANE : SOpInst<"vqdmlsl_lane", "wwddi", "si", OP_QDMLSL_LN>;
def VMUL_N : IOpInst<"vmul_n", "dds", "sifUsUiQsQiQfQUsQUi", OP_MUL_N>;
def VMUL_LANE : IOpInst<"vmul_lane", "ddgi",
"sifUsUiQsQiQfQUsQUi", OP_MUL_LN>;
-def VMULL_N : SInst<"vmull_n", "wda", "siUsUi">;
+def VMULL_N : SOpInst<"vmull_n", "wds", "siUsUi", OP_MULL_N>;
def VMULL_LANE : SOpInst<"vmull_lane", "wddi", "siUsUi", OP_MULL_LN>;
-def VQDMULL_N : SInst<"vqdmull_n", "wda", "si">;
+def VQDMULL_N : SOpInst<"vqdmull_n", "wds", "si", OP_QDMULL_N>;
def VQDMULL_LANE : SOpInst<"vqdmull_lane", "wddi", "si", OP_QDMULL_LN>;
-def VQDMULH_N : SInst<"vqdmulh_n", "dda", "siQsQi">;
+def VQDMULH_N : SOpInst<"vqdmulh_n", "dds", "siQsQi", OP_QDMULH_N>;
def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "ddgi", "siQsQi", OP_QDMULH_LN>;
-def VQRDMULH_N : SInst<"vqrdmulh_n", "dda", "siQsQi">;
+def VQRDMULH_N : SOpInst<"vqrdmulh_n", "dds", "siQsQi", OP_QRDMULH_N>;
def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "ddgi", "siQsQi", OP_QRDMULH_LN>;
let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
@@ -530,12 +536,12 @@ def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "dddgi", "siQsQi", OP_QRDMLAH_LN>;
def VQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "dddgi", "siQsQi", OP_QRDMLSH_LN>;
}
-def VMLA_N : IOpInst<"vmla_n", "ddda", "siUsUifQsQiQUsQUiQf", OP_MLA_N>;
-def VMLAL_N : SOpInst<"vmlal_n", "wwda", "siUsUi", OP_MLAL_N>;
-def VQDMLAL_N : SInst<"vqdmlal_n", "wwda", "si">;
+def VMLA_N : IOpInst<"vmla_n", "ddds", "siUsUifQsQiQUsQUiQf", OP_MLA_N>;
+def VMLAL_N : SOpInst<"vmlal_n", "wwds", "siUsUi", OP_MLAL_N>;
+def VQDMLAL_N : SOpInst<"vqdmlal_n", "wwds", "si", OP_QDMLAL_N>;
def VMLS_N : IOpInst<"vmls_n", "ddds", "siUsUifQsQiQUsQUiQf", OP_MLS_N>;
-def VMLSL_N : SOpInst<"vmlsl_n", "wwda", "siUsUi", OP_MLSL_N>;
-def VQDMLSL_N : SInst<"vqdmlsl_n", "wwda", "si">;
+def VMLSL_N : SOpInst<"vmlsl_n", "wwds", "siUsUi", OP_MLSL_N>;
+def VQDMLSL_N : SOpInst<"vqdmlsl_n", "wwds", "si", OP_QDMLSL_N>;
////////////////////////////////////////////////////////////////////////////////
// E.3.26 Vector Extract
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index 4314ed1b8da7..984ed787037f 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -249,7 +249,6 @@ def OP_UNAVAILABLE : Operation {
// s: scalar of element type
// z: scalar of half width element type, signed
// r: scalar of double width element type, signed
-// a: scalar of element type (splat to vector type)
// b: scalar of unsigned integer/long type (int/float args)
// $: scalar of signed integer/long type (int/float args)
// y: scalar of float
diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c
index 5db29344a000..915c38c573e2 100644
--- a/clang/test/CodeGen/aarch64-neon-2velem.c
+++ b/clang/test/CodeGen/aarch64-neon-2velem.c
@@ -2784,11 +2784,11 @@ float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmull_high_n_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
// CHECK: ret <4 x i32> [[VMULL5_I_I]]
@@ -2798,9 +2798,9 @@ int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: @test_vmull_high_n_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
// CHECK: ret <2 x i64> [[VMULL3_I_I]]
@@ -2810,11 +2810,11 @@ int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: @test_vmull_high_n_u16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
// CHECK: ret <4 x i32> [[VMULL5_I_I]]
@@ -2824,9 +2824,9 @@ uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
// CHECK-LABEL: @test_vmull_high_n_u32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
// CHECK: ret <2 x i64> [[VMULL3_I_I]]
@@ -2836,11 +2836,11 @@ uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
// CHECK-LABEL: @test_vqdmull_high_n_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
// CHECK: [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
@@ -2851,9 +2851,9 @@ int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: @test_vqdmull_high_n_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
// CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
@@ -2920,12 +2920,12 @@ uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: @test_vqdmlal_high_n_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
// CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]])
@@ -2936,10 +2936,10 @@ int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vqdmlal_high_n_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
// CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]])
@@ -3006,12 +3006,12 @@ uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: @test_vqdmlsl_high_n_s16(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
// CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]])
@@ -3022,10 +3022,10 @@ int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vqdmlsl_high_n_s32(
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
// CHECK: [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]])
@@ -3233,11 +3233,11 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
}
// CHECK-LABEL: @test_vmull_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: ret <4 x i32> [[VMULL5_I]]
@@ -3246,9 +3246,9 @@ int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
}
// CHECK-LABEL: @test_vmull_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: ret <2 x i64> [[VMULL3_I]]
@@ -3257,11 +3257,11 @@ int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
}
// CHECK-LABEL: @test_vmull_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: ret <4 x i32> [[VMULL5_I]]
@@ -3270,9 +3270,9 @@ uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
}
// CHECK-LABEL: @test_vmull_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: ret <2 x i64> [[VMULL3_I]]
@@ -3281,11 +3281,11 @@ uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
}
// CHECK-LABEL: @test_vqdmull_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
@@ -3295,9 +3295,9 @@ int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqdmull_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
@@ -3307,11 +3307,11 @@ int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
}
// CHECK-LABEL: @test_vqdmulh_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
@@ -3321,7 +3321,6 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqdmulhq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
@@ -3330,6 +3329,7 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
@@ -3339,9 +3339,9 @@ int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqdmulh_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
@@ -3351,11 +3351,11 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
}
// CHECK-LABEL: @test_vqdmulhq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
@@ -3365,11 +3365,11 @@ int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
}
// CHECK-LABEL: @test_vqrdmulh_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
@@ -3379,7 +3379,6 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqrdmulhq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
@@ -3388,6 +3387,7 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
@@ -3397,9 +3397,9 @@ int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqrdmulh_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
@@ -3409,11 +3409,11 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
}
// CHECK-LABEL: @test_vqrdmulhq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
@@ -3575,12 +3575,12 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
}
// CHECK-LABEL: @test_vqdmlal_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
@@ -3590,10 +3590,10 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
}
// CHECK-LABEL: @test_vqdmlal_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
@@ -3755,12 +3755,12 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
}
// CHECK-LABEL: @test_vqdmlsl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
@@ -3770,10 +3770,10 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
}
// CHECK-LABEL: @test_vqdmlsl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c
index 077f128b7a3e..5c34d1c37de0 100644
--- a/clang/test/CodeGen/arm_neon_intrinsics.c
+++ b/clang/test/CodeGen/arm_neon_intrinsics.c
@@ -8437,11 +8437,11 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
}
// CHECK-LABEL: @test_vmull_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: ret <4 x i32> [[VMULL5_I]]
@@ -8450,9 +8450,9 @@ int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
}
// CHECK-LABEL: @test_vmull_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: ret <2 x i64> [[VMULL3_I]]
@@ -8461,11 +8461,11 @@ int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
}
// CHECK-LABEL: @test_vmull_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: ret <4 x i32> [[VMULL5_I]]
@@ -8474,9 +8474,9 @@ uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
}
// CHECK-LABEL: @test_vmull_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: ret <2 x i64> [[VMULL3_I]]
@@ -9717,12 +9717,12 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
}
// CHECK-LABEL: @test_vqdmlal_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
@@ -9732,10 +9732,10 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
}
// CHECK-LABEL: @test_vqdmlal_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
@@ -9791,12 +9791,12 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
}
// CHECK-LABEL: @test_vqdmlsl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
@@ -9806,10 +9806,10 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
}
// CHECK-LABEL: @test_vqdmlsl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
@@ -9903,11 +9903,11 @@ int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
}
// CHECK-LABEL: @test_vqdmulh_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
@@ -9917,9 +9917,9 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqdmulh_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
@@ -9929,7 +9929,6 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
}
// CHECK-LABEL: @test_vqdmulhq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
@@ -9938,6 +9937,7 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
@@ -9947,11 +9947,11 @@ int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqdmulhq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
@@ -10003,11 +10003,11 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
}
// CHECK-LABEL: @test_vqdmull_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
@@ -10017,9 +10017,9 @@ int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqdmull_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
@@ -10241,11 +10241,11 @@ int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
}
// CHECK-LABEL: @test_vqrdmulh_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
@@ -10255,9 +10255,9 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqrdmulh_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
@@ -10267,7 +10267,6 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
}
// CHECK-LABEL: @test_vqrdmulhq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
@@ -10276,6 +10275,7 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
@@ -10285,11 +10285,11 @@ int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
}
// CHECK-LABEL: @test_vqrdmulhq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index ecd932475cb1..bb893bc49f63 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -412,17 +412,6 @@ class Intrinsic {
return Idx - 1;
}
- /// Return true if the intrinsic takes an splat operand.
- bool hasSplat() const { return Proto.find('a') != std::string::npos; }
-
- /// Return the parameter index of the splat operand.
- unsigned getSplatIdx() const {
- assert(hasSplat());
- unsigned Idx = Proto.find('a');
- assert(Idx > 0 && "Can't return a splat!");
- return Idx - 1;
- }
-
unsigned getNumParams() const { return Proto.size() - 1; }
Type getReturnType() const { return Types[0]; }
Type getParamType(unsigned I) const { return Types[I + 1]; }
@@ -431,7 +420,6 @@ class Intrinsic {
std::string getProto() const { return Proto; }
/// Return true if the prototype has a scalar argument.
- /// This does not return true for the "splat" code ('a').
bool protoHasScalar() const;
/// Return the index that parameter PIndex will sit at
@@ -950,7 +938,6 @@ void Type::applyModifier(char Mod) {
NumVectors = 0;
break;
case 's':
- case 'a':
Bitwidth = ElementBitwidth;
NumVectors = 0;
break;
@@ -1354,8 +1341,6 @@ void Intrinsic::emitShadowedArgs() {
}
}
-// We don't check 'a' in this function, because for builtin function the
-// argument matching to 'a' uses a vector type splatted from a scalar type.
bool Intrinsic::protoHasScalar() const {
return (Proto.find('s') != std::string::npos ||
Proto.find('z') != std::string::npos ||
@@ -1374,12 +1359,6 @@ void Intrinsic::emitBodyAsBuiltinCall() {
bool SRet = getReturnType().getNumVectors() >= 2;
StringRef N = Name;
- if (hasSplat()) {
- // Call the non-splat builtin: chop off the "_n" suffix from the name.
- assert(N.endswith("_n"));
- N = N.drop_back(2);
- }
-
ClassKind LocalCK = CK;
if (!protoHasScalar())
LocalCK = ClassB;
@@ -1413,21 +1392,8 @@ void Intrinsic::emitBodyAsBuiltinCall() {
continue;
}
- std::string Arg;
+ std::string Arg = V.getName();
Type CastToType = T;
- if (hasSplat() && I == getSplatIdx()) {
- Arg = "(" + BaseType.str() + ") {";
- for (unsigned J = 0; J < BaseType.getNumElements(); ++J) {
- if (J != 0)
- Arg += ", ";
- Arg += V.getName();
- }
- Arg += "}";
-
- CastToType = BaseType;
- } else {
- Arg = V.getName();
- }
// Check if an explicit cast is needed.
if (CastToType.isVector() &&
@@ -2096,10 +2062,6 @@ void NeonEmitter::genBuiltinsDef(raw_ostream &OS,
for (auto *Def : Defs) {
if (Def->hasBody())
continue;
- // Functions with 'a' (the splat code) in the type prototype should not get
- // their own builtin as they use the non-splat variant.
- if (Def->hasSplat())
- continue;
std::string S = "BUILTIN(__builtin_neon_" + Def->getMangledName() + ", \"";
@@ -2136,10 +2098,6 @@ void NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS,
// __builtin_neon_* so we don't need to generate a definition for it.
if (Def->hasBody())
continue;
- // Functions with 'a' (the splat code) in the type prototype should not get
- // their own builtin as they use the non-splat variant.
- if (Def->hasSplat())
- continue;
// Functions which have a scalar argument cannot be overloaded, no need to
// check them if we are emitting the type checking code.
if (Def->protoHasScalar())
@@ -2220,10 +2178,6 @@ void NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS,
for (auto *Def : Defs) {
if (Def->hasBody())
continue;
- // Functions with 'a' (the splat code) in the type prototype should not get
- // their own builtin as they use the non-splat variant.
- if (Def->hasSplat())
- continue;
// Functions which do not have an immediate do not need to have range
// checking code emitted.
if (!Def->hasImmediate())