[clang] 62ab15f - [ARM] Setting missing isLaneQ attribute on Neon Intrisics definitions

Thu Mar 19 04:52:50 PDT 2020

Author: Lucas Prates
Date: 2020-03-19T11:52:41Z
New Revision: 62ab15ffa3f910c36758e99324deac12ee006c90

URL: https://github.com/llvm/llvm-project/commit/62ab15ffa3f910c36758e99324deac12ee006c90
DIFF: https://github.com/llvm/llvm-project/commit/62ab15ffa3f910c36758e99324deac12ee006c90.diff

LOG: [ARM] Setting missing isLaneQ attribute on Neon Intrisics definitions

Summary:
Some of the `*_laneq` intrinsics defined in arm_neon.td were missing the
setting of the `isLaneQ` attribute. This patch sets the attribute on the
related definitions, as they will be required to properly perform range
checks on their lane arguments.

Reviewers: jmolloy, t.p.northover, rsmith, olista01, dnsampaio

Reviewed By: dnsampaio

Subscribers: dnsampaio, kristof.beyls, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D74616

Added: 
    clang/test/CodeGen/arm-neon-range-checks.c

Modified: 
    clang/include/clang/Basic/arm_neon.td
    clang/include/clang/Basic/arm_neon_incl.td
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/CodeGen/CodeGenFunction.h
    clang/test/CodeGen/aarch64-neon-2velem.c
    clang/test/CodeGen/aarch64-neon-fma.c
    clang/test/CodeGen/aarch64-poly64.c
    clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
    clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
    clang/test/CodeGen/arm64_vdupq_n_f64.c
    clang/test/CodeGen/arm_neon_intrinsics.c
    clang/utils/TableGen/NeonEmitter.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 380a2a0a5fa6..f949edc378fc 100644

--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -51,39 +51,39 @@ def OP_FMLA_N   : Op<(call "vfma", $p0, $p1, (dup $p2))>;
 def OP_FMLS_N   : Op<(call "vfma", $p0, (op "-", $p1), (dup $p2))>;
 def OP_MLAL_N   : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>;
 def OP_MLSL_N   : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>;
-def OP_MUL_LN   : Op<(op "*", $p0, (splat $p1, $p2))>;
-def OP_MULX_LN  : Op<(call "vmulx", $p0, (splat $p1, $p2))>;
+def OP_MUL_LN   : Op<(op "*", $p0, (call_mangled "splat_lane", $p1, $p2))>;
+def OP_MULX_LN  : Op<(call "vmulx", $p0, (call_mangled "splat_lane", $p1, $p2))>;
 def OP_MULL_N  : Op<(call "vmull", $p0, (dup $p1))>;
-def OP_MULL_LN  : Op<(call "vmull", $p0, (splat $p1, $p2))>;
-def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (splat $p1, $p2))>;
-def OP_MLA_LN   : Op<(op "+", $p0, (op "*", $p1, (splat $p2, $p3)))>;
-def OP_MLS_LN   : Op<(op "-", $p0, (op "*", $p1, (splat $p2, $p3)))>;
-def OP_MLAL_LN  : Op<(op "+", $p0, (call "vmull", $p1, (splat $p2, $p3)))>;
+def OP_MULL_LN  : Op<(call "vmull", $p0, (call_mangled "splat_lane", $p1, $p2))>;
+def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (call_mangled "splat_lane", $p1, $p2))>;
+def OP_MLA_LN   : Op<(op "+", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
+def OP_MLS_LN   : Op<(op "-", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
+def OP_MLAL_LN  : Op<(op "+", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
 def OP_MLALHi_LN: Op<(op "+", $p0, (call "vmull", (call "vget_high", $p1),
-                                                  (splat $p2, $p3)))>;
-def OP_MLSL_LN  : Op<(op "-", $p0, (call "vmull", $p1, (splat $p2, $p3)))>;
+                                                  (call_mangled "splat_lane", $p2, $p3)))>;
+def OP_MLSL_LN  : Op<(op "-", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
 def OP_MLSLHi_LN : Op<(op "-", $p0, (call "vmull", (call "vget_high", $p1),
-                                                   (splat $p2, $p3)))>;
+                                                   (call_mangled "splat_lane", $p2, $p3)))>;
 def OP_QDMULL_N : Op<(call "vqdmull", $p0, (dup $p1))>;
-def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (splat $p1, $p2))>;
+def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (call_mangled "splat_lane", $p1, $p2))>;
 def OP_QDMULLHi_LN : Op<(call "vqdmull", (call "vget_high", $p0),
-                                         (splat $p1, $p2))>;
+                                         (call_mangled "splat_lane", $p1, $p2))>;
 def OP_QDMLAL_N : Op<(call "vqdmlal", $p0, $p1, (dup $p2))>;
-def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (splat $p2, $p3))>;
+def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>;
 def OP_QDMLALHi_LN : Op<(call "vqdmlal", $p0, (call "vget_high", $p1),
-                                              (splat $p2, $p3))>;
+                                              (call_mangled "splat_lane", $p2, $p3))>;
 def OP_QDMLSL_N : Op<(call "vqdmlsl", $p0, $p1, (dup $p2))>;
-def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (splat $p2, $p3))>;
+def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>;
 def OP_QDMLSLHi_LN : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1),
-                                              (splat $p2, $p3))>;
+                                              (call_mangled "splat_lane", $p2, $p3))>;
 def OP_QDMULH_N : Op<(call "vqdmulh", $p0, (dup $p1))>;
-def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (splat $p1, $p2))>;
-def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (splat $p1, $p2))>;
+def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>;
+def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>;
 def OP_QRDMULH_N : Op<(call "vqrdmulh", $p0, (dup $p1))>;
 def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>;
 def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>;
-def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
-def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
+def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
+def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
 def OP_FMS_LN   : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>;
 def OP_FMS_LNQ  : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>;
 def OP_TRN1     : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2),
@@ -115,7 +115,7 @@ def OP_HI       : Op<(shuffle $p0, $p0, (highhalf mask0))>;
 def OP_LO       : Op<(shuffle $p0, $p0, (lowhalf mask0))>;
 def OP_CONC     : Op<(shuffle $p0, $p1, (add mask0, mask1))>;
 def OP_DUP      : Op<(dup $p0)>;
-def OP_DUP_LN   : Op<(splat $p0, $p1)>;
+def OP_DUP_LN   : Op<(call_mangled "splat_lane", $p0, $p1)>;
 def OP_SEL      : Op<(cast "R", (op "|",
                                     (op "&", $p0, (cast $p0, $p1)),
                                     (op "&", (op "~", $p0), (cast $p0, $p2))))>;
@@ -207,10 +207,10 @@ def OP_SCALAR_HALF_SET_LNQ : Op<(bitcast "float16x8_t",
 
 def OP_DOT_LN
     : Op<(call "vdot", $p0, $p1,
-          (bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>;
+          (bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>;
 def OP_DOT_LNQ
     : Op<(call "vdot", $p0, $p1,
-          (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>;
+          (bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>;
 
 def OP_FMLAL_LN     : Op<(call "vfmlal_low", $p0, $p1,
                            (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
@@ -222,7 +222,19 @@ def OP_FMLSL_LN_Hi  : Op<(call "vfmlsl_high", $p0, $p1,
                            (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
 
 //===----------------------------------------------------------------------===//
-// Instructions
+// Auxiliary Instructions
+//===----------------------------------------------------------------------===//
+
+// Splat operation - performs a range-checked splat over a vector
+def SPLAT  : WInst<"splat_lane", ".(!q)I",
+                   "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl">;
+def SPLATQ : WInst<"splat_laneq", ".(!Q)I",
+                   "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> {
+  let isLaneQ = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Intrinsics
 //===----------------------------------------------------------------------===//
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -535,8 +547,8 @@ def VQDMULH_LANE  : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
 def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
 }
 let ArchGuard = "defined(__aarch64__)" in {
-def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..qI", "siQsQi">;
-def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">;
+def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi">;
+def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi">;
 }
 
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
@@ -881,16 +893,22 @@ def COPY_LANE : IOpInst<"vcopy_lane", "..I.I",
 def COPYQ_LANE : IOpInst<"vcopy_lane", "..IqI",
                         "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>;
 def COPY_LANEQ : IOpInst<"vcopy_laneq", "..IQI",
-                     "csilPcPsPlUcUsUiUlfd", OP_COPY_LN>;
+                     "csilPcPsPlUcUsUiUlfd", OP_COPY_LN> {
+  let isLaneQ = 1;
+}
 def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "..I.I",
-                     "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>;
+                     "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN> {
+  let isLaneQ = 1;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Set all lanes to same value
 def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "hdQhQdPlQPl", OP_DUP_LN>;
 def VDUP_LANE2: WOpInst<"vdup_laneq", ".QI",
                   "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl",
-                        OP_DUP_LN>;
+                        OP_DUP_LN> {
+  let isLaneQ = 1;
+}
 def DUP_N   : WOpInst<"vdup_n", ".1", "dQdPlQPl", OP_DUP>;
 def MOV_N   : WOpInst<"vmov_n", ".1", "dQdPlQPl", OP_DUP>;
 
@@ -906,38 +924,60 @@ def CREATE : NoTestOpInst<"vcreate", ".(IU>)", "dPl", OP_CAST> {
 ////////////////////////////////////////////////////////////////////////////////
 
 def VMLA_LANEQ   : IOpInst<"vmla_laneq", "...QI",
-                           "siUsUifQsQiQUsQUiQf", OP_MLA_LN>;
+                           "siUsUifQsQiQUsQUiQf", OP_MLA_LN> {
+  let isLaneQ = 1;
+}
 def VMLS_LANEQ   : IOpInst<"vmls_laneq", "...QI",
-                           "siUsUifQsQiQUsQUiQf", OP_MLS_LN>;
+                           "siUsUifQsQiQUsQUiQf", OP_MLS_LN> {
+  let isLaneQ = 1;
+}
 
 def VFMA_LANE    : IInst<"vfma_lane", "...qI", "fdQfQd">;
 def VFMA_LANEQ   : IInst<"vfma_laneq", "...QI", "fdQfQd"> {
   let isLaneQ = 1;
 }
 def VFMS_LANE    : IOpInst<"vfms_lane", "...qI", "fdQfQd", OP_FMS_LN>;
-def VFMS_LANEQ   : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ>;
+def VFMS_LANEQ   : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ> {
+  let isLaneQ = 1;
+}
 
-def VMLAL_LANEQ  : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN>;
+def VMLAL_LANEQ  : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN> {
+  let isLaneQ = 1;
+}
 def VMLAL_HIGH_LANE   : SOpInst<"vmlal_high_lane", "(>Q)(>Q)Q.I", "siUsUi",
                                 OP_MLALHi_LN>;
 def VMLAL_HIGH_LANEQ  : SOpInst<"vmlal_high_laneq", "(>Q)(>Q)QQI", "siUsUi",
-                                OP_MLALHi_LN>;
-def VMLSL_LANEQ  : SOpInst<"vmlsl_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN>;
+                                OP_MLALHi_LN> {
+  let isLaneQ = 1;
+}
+def VMLSL_LANEQ  : SOpInst<"vmlsl_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN> {
+  let isLaneQ = 1;
+}
 def VMLSL_HIGH_LANE   : SOpInst<"vmlsl_high_lane", "(>Q)(>Q)Q.I", "siUsUi",
                                 OP_MLSLHi_LN>;
 def VMLSL_HIGH_LANEQ  : SOpInst<"vmlsl_high_laneq", "(>Q)(>Q)QQI", "siUsUi",
-                                OP_MLSLHi_LN>;
+                                OP_MLSLHi_LN> {
+  let isLaneQ = 1;
+}
 
-def VQDMLAL_LANEQ  : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN>;
+def VQDMLAL_LANEQ  : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN> {
+  let isLaneQ = 1;
+}
 def VQDMLAL_HIGH_LANE   : SOpInst<"vqdmlal_high_lane", "(>Q)(>Q)Q.I", "si",
                                 OP_QDMLALHi_LN>;
 def VQDMLAL_HIGH_LANEQ  : SOpInst<"vqdmlal_high_laneq", "(>Q)(>Q)QQI", "si",
-                                OP_QDMLALHi_LN>;
-def VQDMLSL_LANEQ  : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN>;
+                                OP_QDMLALHi_LN> {
+  let isLaneQ = 1;
+}
+def VQDMLSL_LANEQ  : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN> {
+  let isLaneQ = 1;
+}
 def VQDMLSL_HIGH_LANE   : SOpInst<"vqdmlsl_high_lane", "(>Q)(>Q)Q.I", "si",
                                 OP_QDMLSLHi_LN>;
 def VQDMLSL_HIGH_LANEQ  : SOpInst<"vqdmlsl_high_laneq", "(>Q)(>Q)QQI", "si",
-                                OP_QDMLSLHi_LN>;
+                                OP_QDMLSLHi_LN> {
+  let isLaneQ = 1;
+}
 
 // Newly add double parameter for vmul_lane in aarch64
 // Note: d type is handled by SCALAR_VMUL_LANE
@@ -945,32 +985,48 @@ def VMUL_LANE_A64 : IOpInst<"vmul_lane", "..qI", "Qd", OP_MUL_LN>;
 
 // Note: d type is handled by SCALAR_VMUL_LANEQ
 def VMUL_LANEQ   : IOpInst<"vmul_laneq", "..QI",
-                           "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN>;
-def VMULL_LANEQ  : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN>;
+                           "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN> {
+  let isLaneQ = 1;
+}
+def VMULL_LANEQ  : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN> {
+  let isLaneQ = 1;
+}
 def VMULL_HIGH_LANE   : SOpInst<"vmull_high_lane", "(>Q)Q.I", "siUsUi",
                                 OP_MULLHi_LN>;
 def VMULL_HIGH_LANEQ  : SOpInst<"vmull_high_laneq", "(>Q)QQI", "siUsUi",
-                                OP_MULLHi_LN>;
+                                OP_MULLHi_LN> {
+  let isLaneQ = 1;
+}
 
-def VQDMULL_LANEQ  : SOpInst<"vqdmull_laneq", "(>Q).QI", "si", OP_QDMULL_LN>;
+def VQDMULL_LANEQ  : SOpInst<"vqdmull_laneq", "(>Q).QI", "si", OP_QDMULL_LN> {
+  let isLaneQ = 1;
+}
 def VQDMULL_HIGH_LANE   : SOpInst<"vqdmull_high_lane", "(>Q)Q.I", "si",
                                   OP_QDMULLHi_LN>;
 def VQDMULL_HIGH_LANEQ  : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si",
-                                  OP_QDMULLHi_LN>;
+                                  OP_QDMULLHi_LN> {
+  let isLaneQ = 1;
+}
 
 let isLaneQ = 1 in {
 def VQDMULH_LANEQ  : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
 def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
 }
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
-def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>;
-def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>;
+def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> {
+  let isLaneQ = 1;
+}
+def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN> {
+  let isLaneQ = 1;
+}
 }
 
 // Note: d type implemented by SCALAR_VMULX_LANE
 def VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "fQfQd", OP_MULX_LN>;
 // Note: d type is implemented by SCALAR_VMULX_LANEQ
-def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN>;
+def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN> {
+  let isLaneQ = 1;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Across vectors class
@@ -1380,11 +1436,15 @@ def SCALAR_UQXTN : SInst<"vqmovn", "(1<)1", "SUsSUiSUl">;
 
 // Scalar Floating Point  multiply (scalar, by element)
 def SCALAR_FMUL_LANE : IOpInst<"vmul_lane", "11.I", "SfSd", OP_SCALAR_MUL_LN>;
-def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", "SfSd", OP_SCALAR_MUL_LN>;
+def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", "SfSd", OP_SCALAR_MUL_LN> {
+  let isLaneQ = 1;
+}
 
 // Scalar Floating Point  multiply extended (scalar, by element)
 def SCALAR_FMULX_LANE : IOpInst<"vmulx_lane", "11.I", "SfSd", OP_SCALAR_MULX_LN>;
-def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN>;
+def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN> {
+  let isLaneQ = 1;
+}
 
 def SCALAR_VMUL_N : IInst<"vmul_n", "..1", "d">;
 
@@ -1400,48 +1460,70 @@ def SCALAR_VMUL_LANEQ   : IInst<"vmul_laneq", "..QI", "d"> {
 def SCALAR_VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "d", OP_SCALAR_VMULX_LN>;
 
 // VMULX_LANEQ d type implemented using scalar vmulx_laneq
-def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ>;
+def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ> {
+  let isLaneQ = 1;
+}
 
 // Scalar Floating Point fused multiply-add (scalar, by element)
 def SCALAR_FMLA_LANE : IInst<"vfma_lane", "111.I", "SfSd">;
-def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd">;
+def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd"> {
+  let isLaneQ = 1;
+}
 
 // Scalar Floating Point fused multiply-subtract (scalar, by element)
 def SCALAR_FMLS_LANE : IOpInst<"vfms_lane", "111.I", "SfSd", OP_FMS_LN>;
-def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ>;
+def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ> {
+  let isLaneQ = 1;
+}
 
 // Signed Saturating Doubling Multiply Long (scalar by element)
 def SCALAR_SQDMULL_LANE : SOpInst<"vqdmull_lane", "(1>)1.I", "SsSi", OP_SCALAR_QDMULL_LN>;
-def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN>;
+def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN> {
+  let isLaneQ = 1;
+}
 
 // Signed Saturating Doubling Multiply-Add Long (scalar by element)
 def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "(1>)(1>)1.I", "SsSi">;
-def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi">;
+def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi"> {
+  let isLaneQ = 1;
+}
 
 // Signed Saturating Doubling Multiply-Subtract Long (scalar by element)
 def SCALAR_SQDMLS_LANE : SInst<"vqdmlsl_lane", "(1>)(1>)1.I", "SsSi">;
-def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi">;
+def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi"> {
+  let isLaneQ = 1;
+}
 
 // Scalar Integer Saturating Doubling Multiply Half High (scalar by element)
 def SCALAR_SQDMULH_LANE : SOpInst<"vqdmulh_lane", "11.I", "SsSi", OP_SCALAR_QDMULH_LN>;
-def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN>;
+def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN> {
+  let isLaneQ = 1;
+}
 
 // Scalar Integer Saturating Rounding Doubling Multiply Half High
 def SCALAR_SQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "11.I", "SsSi", OP_SCALAR_QRDMULH_LN>;
-def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN>;
+def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN> {
+  let isLaneQ = 1;
+}
 
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
 // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half
 def SCALAR_SQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "111.I", "SsSi", OP_SCALAR_QRDMLAH_LN>;
-def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN>;
+def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN> {
+  let isLaneQ = 1;
+}
 
 // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half
 def SCALAR_SQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "111.I", "SsSi", OP_SCALAR_QRDMLSH_LN>;
-def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN>;
+def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN> {
+  let isLaneQ = 1;
+}
 }
 
 def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
-def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
+def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs"> {
+  let isLaneQ = 1;
+}
 }
 
 // ARMv8.2-A FP16 vector intrinsics for A32/A64.
@@ -1605,36 +1687,52 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarc
 
   // FMA lane
   def VFMA_LANEH   : IInst<"vfma_lane", "...qI", "hQh">;
-  def VFMA_LANEQH  : IInst<"vfma_laneq", "...QI", "hQh">;
+  def VFMA_LANEQH  : IInst<"vfma_laneq", "...QI", "hQh"> {
+    let isLaneQ = 1;
+  }
 
   // FMA lane with scalar argument
   def FMLA_NH      : SOpInst<"vfma_n", "...1", "hQh", OP_FMLA_N>;
   // Scalar floating point fused multiply-add (scalar, by element)
   def SCALAR_FMLA_LANEH  : IInst<"vfma_lane", "111.I", "Sh">;
-  def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh">;
+  def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh"> {
+    let isLaneQ = 1;
+  }
 
   // FMS lane
   def VFMS_LANEH   : IOpInst<"vfms_lane", "...qI", "hQh", OP_FMS_LN>;
-  def VFMS_LANEQH  : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ>;
+  def VFMS_LANEQH  : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ> {
+    let isLaneQ = 1;
+  }
   // FMS lane with scalar argument
   def FMLS_NH      : SOpInst<"vfms_n", "...1", "hQh", OP_FMLS_N>;
   // Scalar floating foint fused multiply-subtract (scalar, by element)
   def SCALAR_FMLS_LANEH  : IOpInst<"vfms_lane", "111.I", "Sh", OP_FMS_LN>;
-  def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ>;
+  def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ> {
+    let isLaneQ = 1;
+  }
 
   // Mul lane
-  def VMUL_LANEQH   : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN>;
+  def VMUL_LANEQH   : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN> {
+    let isLaneQ = 1;
+  }
   // Scalar floating point  multiply (scalar, by element)
   def SCALAR_FMUL_LANEH  : IOpInst<"vmul_lane", "11.I", "Sh", OP_SCALAR_MUL_LN>;
-  def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN>;
+  def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN> {
+    let isLaneQ = 1;
+  }
 
   // Mulx lane
   def VMULX_LANEH   : IOpInst<"vmulx_lane", "..qI", "hQh", OP_MULX_LN>;
-  def VMULX_LANEQH  : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN>;
+  def VMULX_LANEQH  : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN> {
+    let isLaneQ = 1;
+  }
   def VMULX_NH      : IOpInst<"vmulx_n", "..1", "hQh", OP_MULX_N>;
   // Scalar floating point  mulx (scalar, by element)
   def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "11.I", "Sh">;
-  def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh">;
+  def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh"> {
+    let isLaneQ = 1;
+  }
 
   // ARMv8.2-A FP16 reduction vector intrinsics.
   def VMAXVH   : SInst<"vmaxv", "1.", "hQh">;
@@ -1651,7 +1749,9 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarc
   def VUZP2H     : SOpInst<"vuzp2", "...", "hQh", OP_UZP2>;
 
   def SCALAR_VDUP_LANEH  : IInst<"vdup_lane", "1.I", "Sh">;
-  def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh">;
+  def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh"> {
+    let isLaneQ = 1;
+  }
 }
 
 // v8.2-A dot product instructions.
@@ -1661,7 +1761,9 @@ let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in {
 }
 let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in {
   // Variants indexing into a 128-bit vector are A64 only.
-  def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", OP_DOT_LNQ>;
+  def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", OP_DOT_LNQ> {
+    let isLaneQ = 1;
+  }
 }
 
 // v8.2-A FP16 fused multiply-add long instructions.
@@ -1676,10 +1778,18 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
   def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLAL_LN_Hi>;
   def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLSL_LN_Hi>;
 
-  def VFMLAL_LANEQ_LOW  : SOpInst<"vfmlal_laneq_low",  "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN>;
-  def VFMLSL_LANEQ_LOW  : SOpInst<"vfmlsl_laneq_low",  "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN>;
-  def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi>;
-  def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
+  def VFMLAL_LANEQ_LOW  : SOpInst<"vfmlal_laneq_low",  "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN> {
+    let isLaneQ = 1;
+  }
+  def VFMLSL_LANEQ_LOW  : SOpInst<"vfmlsl_laneq_low",  "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN> {
+    let isLaneQ = 1;
+  }
+  def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi> {
+    let isLaneQ = 1;
+  }
+  def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi> {
+    let isLaneQ = 1;
+  }
 }
 
 // v8.3-A Vector complex addition intrinsics

diff  --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index 28b00d162a00..d817e7acb913 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -60,6 +60,15 @@ def op;
 // example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)"
 //            (assuming $p0 has type int16x8_t).
 def call;
+// call_mangled - Invoke another intrinsic matching the mangled name variation
+//                of the caller's base type. If there is no intrinsic defined
+//                that has the variation and takes the given types, an error
+//                is generated at tblgen time.
+// example: (call_mangled "vfma_lane", $p0, $p1) -> "vfma_lane(__p0, __p1)"
+//            (assuming non-LaneQ caller)
+//          (call_mangled "vfma_lane", $p0, $p1) -> "vfma_laneq(__p0, __p1)"
+//            (assuming LaneQ caller)
+def call_mangled;
 // cast - Perform a cast to a 
diff erent type. This gets emitted as a static
 //        C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use
 //        "bitcast".
@@ -79,6 +88,7 @@ def call;
 //          - "D" - Double the number of lanes in the type.
 //          - "8" - Convert type to an equivalent vector of 8-bit signed
 //                  integers.
+//          - "32" - Convert type to an equivalent vector of 32-bit integers.
 // example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return
 //           value is of type "int32x4_t".
 //          (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0
@@ -100,12 +110,6 @@ def dup;
 // example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}"
 //          (assuming __p1 is float16x4_t, and __p2 is a compatible scalar).
 def dup_typed;
-// splat - Take a vector and a lane index, and return a vector of the same type
-//         containing repeated instances of the source vector at the lane index.
-// example: (splat $p0, $p1) ->
-//            "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)"
-//          (assuming __p0 has four elements).
-def splat;
 // save_temp - Create a temporary (local) variable. The variable takes a name
 //             based on the zero'th parameter and can be referenced using
 //             using that name in subsequent DAGs in the same

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b0be1ecb5454..e42339dbcfcc 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4495,10 +4495,15 @@ static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
   }
 }
 
+Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
+                                      const ElementCount &Count) {
+  Value *SV = llvm::ConstantVector::getSplat(Count, C);
+  return Builder.CreateShuffleVector(V, V, SV, "lane");
+}
+
 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
   ElementCount EC = V->getType()->getVectorElementCount();
-  Value *SV = llvm::ConstantVector::getSplat(EC, C);
-  return Builder.CreateShuffleVector(V, V, SV, "lane");
+  return EmitNeonSplat(V, C, EC);
 }
 
 Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
@@ -4605,6 +4610,10 @@ struct ARMVectorIntrinsicInfo {
       TypeModifier }
 
 static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
+  NEONMAP0(splat_lane_v),
+  NEONMAP0(splat_laneq_v),
+  NEONMAP0(splatq_lane_v),
+  NEONMAP0(splatq_laneq_v),
   NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
   NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
   NEONMAP1(vabs_v, arm_neon_vabs, 0),
@@ -4886,6 +4895,10 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
 };
 
 static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
+  NEONMAP0(splat_lane_v),
+  NEONMAP0(splat_laneq_v),
+  NEONMAP0(splatq_lane_v),
+  NEONMAP0(splatq_laneq_v),
   NEONMAP1(vabs_v, aarch64_neon_abs, 0),
   NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
   NEONMAP0(vaddhn_v),
@@ -5460,6 +5473,19 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
 
   switch (BuiltinID) {
   default: break;
+  case NEON::BI__builtin_neon_splat_lane_v:
+  case NEON::BI__builtin_neon_splat_laneq_v:
+  case NEON::BI__builtin_neon_splatq_lane_v:
+  case NEON::BI__builtin_neon_splatq_laneq_v: {
+    auto NumElements = VTy->getElementCount();
+    if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
+      NumElements = NumElements * 2;
+    if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
+      NumElements = NumElements / 2;
+
+    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
+    return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements);
+  }
   case NEON::BI__builtin_neon_vpadd_v:
   case NEON::BI__builtin_neon_vpaddq_v:
     // We don't allow fp/int overloading of intrinsics.
@@ -5798,9 +5824,14 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
   case NEON::BI__builtin_neon_vqdmulh_lane_v:
   case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
   case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
+    llvm::Type *RTy = Ty;
+    if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
+        BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
+      RTy = llvm::VectorType::get(Ty->getVectorElementType(),
+                                  Ty->getVectorNumElements() * 2);
     llvm::Type *Tys[2] = {
-        Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                            /*isQuad*/ false))};
+        RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                             /*isQuad*/ false))};
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
   }
   case NEON::BI__builtin_neon_vqdmulhq_laneq_v:

diff  --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 440b088330ef..0128a07030c5 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3894,6 +3894,8 @@ class CodeGenFunction : public CodeGenTypeCache {
                             SmallVectorImpl<llvm::Value*> &O,
                             const char *name,
                             unsigned shift = 0, bool rightshift = false);
+  llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
+                             const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
   llvm::Value *EmitNeonShiftVector(llvm::Value *V, llvm::Type *Ty,
                                    bool negateForRightShift);

diff  --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c
index 5ad06cfff808..25c0ae4988b7 100644
--- a/clang/test/CodeGen/aarch64-neon-2velem.c
+++ b/clang/test/CodeGen/aarch64-neon-2velem.c
@@ -7,8 +7,10 @@
 
 // CHECK-LABEL: @test_vmla_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
 //
@@ -18,8 +20,10 @@ int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlaq_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
 //
@@ -29,8 +33,10 @@ int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmla_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
 //
@@ -40,8 +46,10 @@ int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlaq_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -51,8 +59,10 @@ int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmla_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
 //
@@ -62,8 +72,10 @@ int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlaq_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
 //
@@ -73,8 +85,10 @@ int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmla_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
 //
@@ -84,8 +98,10 @@ int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlaq_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -95,8 +111,10 @@ int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmls_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 //
@@ -106,8 +124,10 @@ int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlsq_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 //
@@ -117,8 +137,10 @@ int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmls_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 //
@@ -128,8 +150,10 @@ int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlsq_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -139,8 +163,10 @@ int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmls_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 //
@@ -150,8 +176,10 @@ int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlsq_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 //
@@ -161,8 +189,10 @@ int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmls_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 //
@@ -172,8 +202,10 @@ int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlsq_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -183,8 +215,10 @@ int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 //
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
@@ -193,8 +227,10 @@ int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 //
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
@@ -203,8 +239,10 @@ int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 //
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
@@ -213,8 +251,10 @@ int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 //
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
@@ -223,8 +263,10 @@ int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 //
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
@@ -233,8 +275,10 @@ uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 //
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
@@ -243,8 +287,10 @@ uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 //
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
@@ -253,8 +299,10 @@ uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 //
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
@@ -263,8 +311,10 @@ uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 //
 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -273,8 +323,10 @@ int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 //
 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -283,8 +335,10 @@ int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 //
 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -293,8 +347,10 @@ int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 //
 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
@@ -303,8 +359,10 @@ int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 //
 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
@@ -313,8 +371,10 @@ uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 //
 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
@@ -323,8 +383,10 @@ uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 //
 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
@@ -333,8 +395,10 @@ uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 //
 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
@@ -584,10 +648,12 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
 
 // CHECK-LABEL: @test_vmlal_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -597,10 +663,12 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlal_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -610,10 +678,12 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlal_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -623,10 +693,12 @@ int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlal_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -637,10 +709,12 @@ int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -651,10 +725,12 @@ int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -665,10 +741,12 @@ int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -679,10 +757,12 @@ int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -692,10 +772,12 @@ int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlsl_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -705,10 +787,12 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlsl_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -718,10 +802,12 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlsl_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -731,10 +817,12 @@ int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlsl_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -745,10 +833,12 @@ int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -759,10 +849,12 @@ int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -773,10 +865,12 @@ int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -787,10 +881,12 @@ int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -800,10 +896,12 @@ int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlal_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -813,10 +911,12 @@ int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlal_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -826,10 +926,12 @@ int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlal_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -839,10 +941,12 @@ int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlal_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -853,10 +957,12 @@ int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -867,10 +973,12 @@ int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -881,10 +989,12 @@ int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -895,10 +1005,12 @@ int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -908,10 +1020,12 @@ int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlsl_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -921,10 +1035,12 @@ int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlsl_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -934,10 +1050,12 @@ int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlsl_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -947,10 +1065,12 @@ int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlsl_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -961,10 +1081,12 @@ int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -975,10 +1097,12 @@ int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -989,10 +1113,12 @@ int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -1003,10 +1129,12 @@ int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -1016,10 +1144,12 @@ int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmull_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
@@ -1028,10 +1158,12 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmull_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
@@ -1040,10 +1172,12 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmull_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
@@ -1052,10 +1186,12 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmull_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
@@ -1065,10 +1201,12 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
 // CHECK-LABEL: @test_vmull_high_lane_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
@@ -1078,10 +1216,12 @@ int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
 // CHECK-LABEL: @test_vmull_high_lane_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
@@ -1091,10 +1231,12 @@ int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
 // CHECK-LABEL: @test_vmull_high_lane_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
@@ -1104,10 +1246,12 @@ uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
 // CHECK-LABEL: @test_vmull_high_lane_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
@@ -1116,10 +1260,12 @@ uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmull_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -1128,10 +1274,12 @@ int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmull_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -1140,10 +1288,12 @@ int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmull_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
@@ -1152,10 +1302,12 @@ uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmull_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
@@ -1165,10 +1317,12 @@ uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
 // CHECK-LABEL: @test_vmull_high_laneq_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -1178,10 +1332,12 @@ int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
 // CHECK-LABEL: @test_vmull_high_laneq_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
@@ -1191,10 +1347,12 @@ int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
 // CHECK-LABEL: @test_vmull_high_laneq_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
@@ -1204,10 +1362,12 @@ uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
 // CHECK-LABEL: @test_vmull_high_laneq_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
@@ -1216,11 +1376,13 @@ uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlal_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -1230,11 +1392,13 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlal_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -1245,11 +1409,13 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_lane_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -1260,11 +1426,13 @@ int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_lane_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -1274,11 +1442,13 @@ int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqdmlsl_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
@@ -1288,11 +1458,13 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlsl_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
@@ -1303,11 +1475,13 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
@@ -1318,11 +1492,13 @@ int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
@@ -1332,10 +1508,12 @@ int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqdmull_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -1345,10 +1523,12 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmull_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -1358,10 +1538,12 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqdmull_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -1371,10 +1553,12 @@ int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmull_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -1385,10 +1569,12 @@ int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
 // CHECK-LABEL: @test_vqdmull_high_lane_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -1399,10 +1585,12 @@ int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
 // CHECK-LABEL: @test_vqdmull_high_lane_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -1413,10 +1601,12 @@ int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
 // CHECK-LABEL: @test_vqdmull_high_laneq_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -1427,10 +1617,12 @@ int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
 // CHECK-LABEL: @test_vqdmull_high_laneq_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -1544,8 +1736,10 @@ int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x float> [[MUL]]
 //
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
@@ -1568,11 +1762,12 @@ float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
   return vmul_lane_f64(a, v, 0);
 }
 
-
 // CHECK-LABEL: @test_vmulq_lane_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x float> [[MUL]]
 //
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
@@ -1581,8 +1776,10 @@ float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_f64(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x double> [[MUL]]
 //
 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
@@ -1591,8 +1788,10 @@ float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x float> [[MUL]]
 //
 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
@@ -1614,11 +1813,12 @@ float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
   return vmul_laneq_f64(a, v, 1);
 }
 
-
 // CHECK-LABEL: @test_vmulq_laneq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x float> [[MUL]]
 //
 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
@@ -1627,8 +1827,10 @@ float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_f64(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x double> [[MUL]]
 //
 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
@@ -1637,10 +1839,12 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
 
 // CHECK-LABEL: @test_vmulx_lane_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
@@ -1649,10 +1853,12 @@ float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulxq_lane_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
@@ -1661,10 +1867,12 @@ float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulxq_lane_f64(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
@@ -1673,10 +1881,12 @@ float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
 
 // CHECK-LABEL: @test_vmulx_laneq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
@@ -1685,10 +1895,12 @@ float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulxq_laneq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
@@ -1697,10 +1909,12 @@ float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulxq_laneq_f64(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
@@ -1709,8 +1923,10 @@ float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
 
 // CHECK-LABEL: @test_vmla_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
 //
@@ -1720,8 +1936,10 @@ int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlaq_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
 //
@@ -1731,8 +1949,10 @@ int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmla_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
 //
@@ -1742,8 +1962,10 @@ int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlaq_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -1753,8 +1975,10 @@ int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmla_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
 //
@@ -1764,8 +1988,10 @@ int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlaq_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
 //
@@ -1775,8 +2001,10 @@ int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmla_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
 //
@@ -1786,8 +2014,10 @@ int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlaq_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -1797,8 +2027,10 @@ int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmls_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 //
@@ -1808,8 +2040,10 @@ int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlsq_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 //
@@ -1819,8 +2053,10 @@ int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmls_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 //
@@ -1830,8 +2066,10 @@ int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlsq_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -1841,8 +2079,10 @@ int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmls_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 //
@@ -1852,8 +2092,10 @@ int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlsq_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 //
@@ -1863,8 +2105,10 @@ int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmls_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 //
@@ -1874,8 +2118,10 @@ int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlsq_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -1885,8 +2131,10 @@ int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 //
 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -1895,8 +2143,10 @@ int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 //
 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -1905,8 +2155,10 @@ int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 //
 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -1915,8 +2167,10 @@ int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 //
 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -1925,8 +2179,10 @@ int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 //
 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
@@ -1935,8 +2191,10 @@ uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 //
 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
@@ -1945,8 +2203,10 @@ uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 //
 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
@@ -1955,8 +2215,10 @@ uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 //
 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
@@ -1965,8 +2227,10 @@ uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 //
 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -1975,8 +2239,10 @@ int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 //
 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -1985,8 +2251,10 @@ int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 //
 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -1995,8 +2263,10 @@ int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 //
 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -2005,8 +2275,10 @@ int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 //
 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
@@ -2015,8 +2287,10 @@ uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 //
 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
@@ -2025,8 +2299,10 @@ uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 //
 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
@@ -2035,8 +2311,10 @@ uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 //
 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
@@ -2210,10 +2488,12 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)
 
 // CHECK-LABEL: @test_vmlal_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2223,10 +2503,12 @@ int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlal_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2236,10 +2518,12 @@ int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlal_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2249,10 +2533,12 @@ int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlal_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2263,10 +2549,12 @@ int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2277,10 +2565,12 @@ int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2291,10 +2581,12 @@ int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2305,10 +2597,12 @@ int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2318,10 +2612,12 @@ int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlsl_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2331,10 +2627,12 @@ int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlsl_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2344,10 +2642,12 @@ int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlsl_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2357,10 +2657,12 @@ int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlsl_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2371,10 +2673,12 @@ int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2385,10 +2689,12 @@ int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2399,10 +2705,12 @@ int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2413,10 +2721,12 @@ int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2426,10 +2736,12 @@ int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlal_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2439,10 +2751,12 @@ int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlal_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2452,10 +2766,12 @@ int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlal_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2465,10 +2781,12 @@ int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlal_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2479,10 +2797,12 @@ int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_u16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2493,10 +2813,12 @@ int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_u32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2507,10 +2829,12 @@ int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2521,10 +2845,12 @@ int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2534,10 +2860,12 @@ int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmlsl_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2547,10 +2875,12 @@ int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmlsl_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2560,10 +2890,12 @@ int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmlsl_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2573,10 +2905,12 @@ int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmlsl_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2587,10 +2921,12 @@ int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2601,10 +2937,12 @@ int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2615,10 +2953,12 @@ int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2629,10 +2969,12 @@ int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2642,10 +2984,12 @@ int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmull_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -2654,10 +2998,12 @@ int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vmull_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -2666,10 +3012,12 @@ int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmull_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
@@ -2678,10 +3026,12 @@ uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmull_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
@@ -2691,10 +3041,12 @@ uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
 // CHECK-LABEL: @test_vmull_high_lane_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -2704,10 +3056,12 @@ int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
 // CHECK-LABEL: @test_vmull_high_lane_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -2717,10 +3071,12 @@ int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
 // CHECK-LABEL: @test_vmull_high_lane_u16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
@@ -2730,10 +3086,12 @@ uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
 // CHECK-LABEL: @test_vmull_high_lane_u32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
@@ -2742,10 +3100,12 @@ uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmull_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -2754,10 +3114,12 @@ int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vmull_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -2766,10 +3128,12 @@ int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmull_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
@@ -2778,10 +3142,12 @@ uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmull_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
@@ -2791,10 +3157,12 @@ uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
 // CHECK-LABEL: @test_vmull_high_laneq_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -2804,10 +3172,12 @@ int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
 // CHECK-LABEL: @test_vmull_high_laneq_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -2817,10 +3187,12 @@ int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
 // CHECK-LABEL: @test_vmull_high_laneq_u16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
@@ -2830,10 +3202,12 @@ uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
 // CHECK-LABEL: @test_vmull_high_laneq_u32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
@@ -2842,11 +3216,13 @@ uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlal_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -2856,11 +3232,13 @@ int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlal_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -2871,11 +3249,13 @@ int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -2886,11 +3266,13 @@ int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -2900,11 +3282,13 @@ int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
@@ -2914,11 +3298,13 @@ int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
@@ -2929,11 +3315,13 @@ int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
@@ -2944,11 +3332,13 @@ int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
@@ -2958,10 +3348,12 @@ int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqdmull_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -2971,10 +3363,12 @@ int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmull_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -2984,10 +3378,12 @@ int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqdmull_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -2997,10 +3393,12 @@ int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmull_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -3011,10 +3409,12 @@ int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
 // CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -3025,10 +3425,12 @@ int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
 // CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -3039,10 +3441,12 @@ int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
 // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -3053,10 +3457,12 @@ int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
 // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -3170,8 +3576,10 @@ int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x float> [[MUL]]
 //
 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
@@ -3180,8 +3588,10 @@ float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x float> [[MUL]]
 //
 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
@@ -3190,8 +3600,10 @@ float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x float> [[MUL]]
 //
 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
@@ -3215,8 +3627,10 @@ float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x float> [[MUL]]
 //
 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
@@ -3225,8 +3639,10 @@ float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x double> [[MUL]]
 //
 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
@@ -3235,10 +3651,12 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
 
 // CHECK-LABEL: @test_vmulx_lane_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
@@ -3247,10 +3665,12 @@ float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulxq_lane_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
@@ -3259,10 +3679,12 @@ float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulxq_lane_f64_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
@@ -3271,10 +3693,12 @@ float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
 
 // CHECK-LABEL: @test_vmulx_laneq_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
@@ -3283,10 +3707,12 @@ float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
@@ -3295,10 +3721,12 @@ float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
@@ -4461,8 +4889,10 @@ int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
 
 // CHECK-LABEL: @test_vmla_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
 //
@@ -4472,8 +4902,10 @@ uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmlaq_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
 //
@@ -4483,8 +4915,10 @@ uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmla_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
 //
@@ -4494,8 +4928,10 @@ uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmlaq_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -4505,8 +4941,10 @@ uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmla_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
 //
@@ -4516,8 +4954,10 @@ uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmlaq_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
 //
@@ -4527,8 +4967,10 @@ uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmla_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
 //
@@ -4538,8 +4980,10 @@ uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vmlaq_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -4549,11 +4993,13 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -4563,11 +5009,13 @@ int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -4578,11 +5026,13 @@ int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -4593,11 +5043,13 @@ int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -4607,8 +5059,10 @@ int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmls_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 //
@@ -4618,8 +5072,10 @@ uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmlsq_lane_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 //
@@ -4629,8 +5085,10 @@ uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmls_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 //
@@ -4640,8 +5098,10 @@ uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmlsq_lane_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -4651,8 +5111,10 @@ uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmls_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 //
@@ -4662,8 +5124,10 @@ uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmlsq_laneq_u16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 //
@@ -4673,8 +5137,10 @@ uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmls_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 //
@@ -4684,8 +5150,10 @@ uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vmlsq_laneq_u32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -4695,11 +5163,13 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
@@ -4709,11 +5179,13 @@ int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
@@ -4724,11 +5196,13 @@ int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
@@ -4739,11 +5213,13 @@ int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
@@ -4857,8 +5333,10 @@ int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmla_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
 //
@@ -4868,8 +5346,10 @@ uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmlaq_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
 //
@@ -4879,8 +5359,10 @@ uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmla_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
 //
@@ -4890,8 +5372,10 @@ uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmlaq_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -4901,8 +5385,10 @@ uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmla_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
 //
@@ -4912,8 +5398,10 @@ uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmlaq_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
 //
@@ -4923,8 +5411,10 @@ uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmla_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
 //
@@ -4934,8 +5424,10 @@ uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vmlaq_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -4945,11 +5437,13 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlal_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -4959,11 +5453,13 @@ int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmlal_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -4974,11 +5470,13 @@ int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -4989,11 +5487,13 @@ int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -5003,8 +5503,10 @@ int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vmls_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 //
@@ -5014,8 +5516,10 @@ uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmlsq_lane_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 //
@@ -5025,8 +5529,10 @@ uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
 
 // CHECK-LABEL: @test_vmls_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 //
@@ -5036,8 +5542,10 @@ uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmlsq_lane_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -5047,8 +5555,10 @@ uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
 
 // CHECK-LABEL: @test_vmls_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 //
@@ -5058,8 +5568,10 @@ uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmlsq_laneq_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 //
@@ -5069,8 +5581,10 @@ uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
 
 // CHECK-LABEL: @test_vmls_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 //
@@ -5080,8 +5594,10 @@ uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vmlsq_laneq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -5091,11 +5607,13 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmlsl_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
@@ -5105,11 +5623,13 @@ int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmlsl_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
@@ -5120,11 +5640,13 @@ int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
@@ -5135,11 +5657,13 @@ int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //

diff  --git a/clang/test/CodeGen/aarch64-neon-fma.c b/clang/test/CodeGen/aarch64-neon-fma.c
index ae02bfbffb98..c2dd315ed9fc 100644
--- a/clang/test/CodeGen/aarch64-neon-fma.c
+++ b/clang/test/CodeGen/aarch64-neon-fma.c
@@ -69,144 +69,177 @@ float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
 }
 
 // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
-// CHECK:   ret <2 x float> [[ADD]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vmla_lane_f32(a, b, v, 0);
 }
 
 // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
-// CHECK:   ret <4 x float> [[ADD]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vmlaq_lane_f32(a, b, v, 0);
 }
 
 // CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
-// CHECK:   ret <2 x float> [[ADD]]
+// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <2 x float> [[ADD]]
 float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vmla_laneq_f32(a, b, v, 0);
 }
 
 // CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
-// CHECK:   ret <4 x float> [[ADD]]
+// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vmlaq_laneq_f32(a, b, v, 0);
 }
 
 // CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
-// CHECK:   ret <2 x float> [[SUB]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vmls_lane_f32(a, b, v, 0);
 }
 
 // CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
-// CHECK:   ret <4 x float> [[SUB]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vmlsq_lane_f32(a, b, v, 0);
 }
 
 // CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
-// CHECK:   ret <2 x float> [[SUB]]
+// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <2 x float> [[SUB]]
 float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vmls_laneq_f32(a, b, v, 0);
 }
 
 // CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
-// CHECK:   ret <4 x float> [[SUB]]
+// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vmlsq_laneq_f32(a, b, v, 0);
 }
 
 // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
-// CHECK:   ret <2 x float> [[ADD]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vmla_lane_f32(a, b, v, 1);
 }
 
 // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
-// CHECK:   ret <4 x float> [[ADD]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vmlaq_lane_f32(a, b, v, 1);
 }
 
 // CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
-// CHECK:   ret <2 x float> [[ADD]]
+// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <2 x float> [[ADD]]
 float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vmla_laneq_f32(a, b, v, 3);
 }
 
 // CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
-// CHECK:   ret <4 x float> [[ADD]]
+// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vmlaq_laneq_f32(a, b, v, 3);
 }
 
 // CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
-// CHECK:   ret <2 x float> [[SUB]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vmls_lane_f32(a, b, v, 1);
 }
 
 // CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
-// CHECK:   ret <4 x float> [[SUB]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <4 x float> [[SUB]]
+//
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vmlsq_lane_f32(a, b, v, 1);
 }
 // CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
-// CHECK:   ret <2 x float> [[SUB]]
+// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <2 x float> [[SUB]]
 float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vmls_laneq_f32(a, b, v, 3);
 }
 
 // CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
-// CHECK:   ret <4 x float> [[SUB]]
+// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:    [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
+// CHECK:    ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vmlsq_laneq_f32(a, b, v, 3);
 }

diff  --git a/clang/test/CodeGen/aarch64-poly64.c b/clang/test/CodeGen/aarch64-poly64.c
index 8c4ef23bb7e1..b7fb1db9b0ff 100644
--- a/clang/test/CodeGen/aarch64-poly64.c
+++ b/clang/test/CodeGen/aarch64-poly64.c
@@ -150,22 +150,28 @@ poly64x2_t test_vmovq_n_p64(poly64_t a) {
 }
 
 // CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer
-// CHECK:   ret <1 x i64> [[SHUFFLE]]
+// CHECK:    [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:    [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK:    ret <1 x i64> [[LANE]]
 poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
   return vdup_lane_p64(vec, 0);
 }
 
 // CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer
-// CHECK:   ret <2 x i64> [[SHUFFLE]]
+// CHECK:    [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:    [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK:    ret <2 x i64> [[LANE]]
 poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
   return vdupq_lane_p64(vec, 0);
 }
 
 // CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #1 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1>
-// CHECK:   ret <2 x i64> [[SHUFFLE]]
+// CHECK:    [[TMP0:%.*]] = bitcast <2 x i64> [[VEC:%.*]] to <16 x i8>
+// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:    [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:    ret <2 x i64> [[LANE]]
 poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
   return vdupq_laneq_p64(vec, 1);
 }

diff  --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index fc339faa6cdb..59b3dfec80cb 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1086,32 +1086,40 @@ float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
 }
 
 // CHECK-LABEL: test_vmul_lane_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]]  = fmul <4 x half> %a, [[TMP0]]
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
 // CHECK: ret <4 x half> [[MUL]]
 float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) {
   return vmul_lane_f16(a, b, 3);
 }
 
 // CHECK-LABEL: test_vmulq_lane_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]]  = fmul <8 x half> %a, [[TMP0]]
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
 // CHECK: ret <8 x half> [[MUL]]
 float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) {
-  return vmulq_lane_f16(a, b, 7);
+  return vmulq_lane_f16(a, b, 3);
 }
 
 // CHECK-LABEL: test_vmul_laneq_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]]  = fmul <4 x half> %a, [[TMP0]]
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
 // CHECK: ret <4 x half> [[MUL]]
 float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) {
   return vmul_laneq_f16(a, b, 7);
 }
 
 // CHECK-LABEL: test_vmulq_laneq_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]]  = fmul <8 x half> %a, [[TMP0]]
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
 // CHECK: ret <8 x half> [[MUL]]
 float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) {
   return vmulq_laneq_f16(a, b, 7);
@@ -1165,33 +1173,49 @@ float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
 }
 
 // CHECK-LABEL: test_vmulx_lane_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]])
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
+// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4
+// CHECK: ret <4 x half> [[VMULX2_I]]
 float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
   return vmulx_lane_f16(a, b, 3);
 }
 
 // CHECK-LABEL: test_vmulxq_lane_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]])
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
+// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4
+// CHECK: ret <8 x half> [[VMULX2_I]]
 float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
-  return vmulxq_lane_f16(a, b, 7);
+  return vmulxq_lane_f16(a, b, 3);
 }
 
 // CHECK-LABEL: test_vmulx_laneq_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]]  = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]])
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
+// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4
+// CHECK: ret <4 x half> [[VMULX2_I]]
 float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
   return vmulx_laneq_f16(a, b, 7);
 }
 
 // CHECK-LABEL: test_vmulxq_laneq_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]]  = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]])
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
+// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4
+// CHECK: ret <8 x half> [[VMULX2_I]]
 float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
   return vmulxq_laneq_f16(a, b, 7);
 }
@@ -1473,17 +1497,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) {
 }
 
 // CHECK-LABEL: test_vdup_lane_f16
-// CHECK:   [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <4 x half> [[SHFL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x half> [[LANE]]
 float16x4_t test_vdup_lane_f16(float16x4_t a) {
   return vdup_lane_f16(a, 3);
 }
 
 // CHECK-LABEL: test_vdupq_lane_f16
-// CHECK:   [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK:   ret <8 x half> [[SHFL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x half> [[LANE]]
 float16x8_t test_vdupq_lane_f16(float16x4_t a) {
-  return vdupq_lane_f16(a, 7);
+  return vdupq_lane_f16(a, 3);
 }
 
 // CHECK-LABEL: @test_vext_f16(

diff  --git a/clang/test/CodeGen/arm-neon-range-checks.c b/clang/test/CodeGen/arm-neon-range-checks.c
new file mode 100644
index 000000000000..488dad6d59ac
--- /dev/null
+++ b/clang/test/CodeGen/arm-neon-range-checks.c
@@ -0,0 +1,424 @@
+// RUN: %clang_cc1 -triple arm64-none-eabi -target-feature +neon -target-feature +dotprod -target-feature +v8.1a -verify %s
+// RUN: %clang_cc1 -triple armv8.1a-none-eabi -target-feature +neon -target-feature +dotprod -target-feature +v8.1a -verify %s
+
+#include <arm_neon.h>
+
+void test_vdot_lane(int32x2_t r, int8x8_t a, int8x8_t b) {
+  vdot_lane_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vdot_lane_s32(r, a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vdot_lane_s32(r, a, b, 0);
+  vdot_lane_s32(r, a, b, 1);
+}
+
+void test_vdotq_lane(int32x4_t r, int8x16_t a, int8x8_t b) {
+  vdotq_lane_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vdotq_lane_s32(r, a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vdotq_lane_s32(r, a, b, 0);
+  vdotq_lane_s32(r, a, b, 1);
+}
+
+#if defined(__aarch64__)
+void test_vdot_laneq(int32x2_t r, int8x8_t a, int8x16_t b) {
+  vdot_laneq_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vdot_laneq_s32(r, a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vdot_laneq_s32(r, a, b, 0);
+  vdot_laneq_s32(r, a, b, 3);
+}
+
+void test_vdotq_laneq(int32x4_t r, int8x16_t a, int8x16_t b) {
+  vdotq_laneq_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vdotq_laneq_s32(r, a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vdotq_laneq_s32(r, a, b, 0);
+  vdotq_laneq_s32(r, a, b, 3);
+}
+#endif
+
+void test_vdup_lane(int32x2_t v) {
+  vdup_lane_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vdup_lane_s32(v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vdup_lane_s32(v, 0);
+  vdup_lane_s32(v, 1);
+}
+
+void test_vdupq_lane(int32x2_t v) {
+  vdupq_lane_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vdupq_lane_s32(v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vdupq_lane_s32(v, 0);
+  vdupq_lane_s32(v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vdup_laneq(int32x4_t v) {
+  vdup_laneq_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vdup_laneq_s32(v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vdup_laneq_s32(v, 0);
+  vdup_laneq_s32(v, 3);
+}
+
+void test_vdupq_laneq(int32x4_t v) {
+  vdupq_laneq_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vdupq_laneq_s32(v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vdupq_laneq_s32(v, 0);
+  vdupq_laneq_s32(v, 3);
+}
+#endif
+
+void test_vmla_lane(int32x2_t a, int32x2_t b, int32x2_t v) {
+  vmla_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmla_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmla_lane_s32(a, b, v, 0);
+  vmla_lane_s32(a, b, v, 1);
+}
+
+void test_vmlaq_lane(int32x4_t a, int32x4_t b, int32x2_t v) {
+  vmlaq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmlaq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmlaq_lane_s32(a, b, v, 0);
+  vmlaq_lane_s32(a, b, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vmla_laneq(int32x2_t a, int32x2_t b, int32x4_t v) {
+  vmla_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmla_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmla_laneq_s32(a, b, v, 0);
+  vmla_laneq_s32(a, b, v, 3);
+}
+
+void test_vmlaq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) {
+  vmlaq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmlaq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmlaq_laneq_s32(a, b, v, 0);
+  vmlaq_laneq_s32(a, b, v, 3);
+}
+
+void test_vmlal_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) {
+  vmlal_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmlal_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmlal_high_lane_s32(a, b, v, 0);
+  vmlal_high_lane_s32(a, b, v, 1);
+}
+
+void test_vmlal_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) {
+  vmlal_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmlal_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmlal_high_laneq_s32(a, b, v, 0);
+  vmlal_high_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vmlal_lane(int64x2_t a, int32x2_t b, int32x2_t v) {
+  vmlal_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmlal_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmlal_lane_s32(a, b, v, 0);
+  vmlal_lane_s32(a, b, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vmlal_laneq(int64x2_t a, int32x2_t b, int32x4_t v) {
+  vmlal_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmlal_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmlal_laneq_s32(a, b, v, 0);
+  vmlal_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vmls_lane(int32x2_t a, int32x2_t b, int32x2_t v) {
+  vmls_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmls_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmls_lane_s32(a, b, v, 0);
+  vmls_lane_s32(a, b, v, 1);
+}
+
+void test_vmlsq_lane(int32x4_t a, int32x4_t b, int32x2_t v) {
+  vmlsq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmlsq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmlsq_lane_s32(a, b, v, 0);
+  vmlsq_lane_s32(a, b, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vmls_laneq(int32x2_t a, int32x2_t b, int32x4_t v) {
+  vmls_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmls_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmls_laneq_s32(a, b, v, 0);
+  vmls_laneq_s32(a, b, v, 3);
+}
+
+void test_vmlsq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) {
+  vmlsq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmlsq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmlsq_laneq_s32(a, b, v, 0);
+  vmlsq_laneq_s32(a, b, v, 3);
+}
+
+void test_vmlsl_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) {
+  vmlsl_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmlsl_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmlsl_high_lane_s32(a, b, v, 0);
+  vmlsl_high_lane_s32(a, b, v, 1);
+}
+
+void test_vmlsl_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) {
+  vmlsl_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmlsl_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmlsl_high_laneq_s32(a, b, v, 0);
+  vmlsl_high_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vmlsl_lane(int64x2_t a, int32x2_t b, int32x2_t v) {
+  vmlsl_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmlsl_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmlsl_lane_s32(a, b, v, 0);
+  vmlsl_lane_s32(a, b, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vmlsl_laneq(int64x2_t a, int32x2_t b, int32x4_t v) {
+  vmlsl_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmlsl_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmlsl_laneq_s32(a, b, v, 0);
+  vmlsl_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vmull_lane(int32x2_t a, int32x2_t b) {
+  vmull_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmull_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmull_lane_s32(a, b, 0);
+  vmull_lane_s32(a, b, 1);
+}
+
+#if defined(__aarch64__)
+void test_vmull_laneq(int32x2_t a, int32x4_t b) {
+  vmull_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmull_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmull_laneq_s32(a, b, 0);
+  vmull_laneq_s32(a, b, 3);
+}
+
+void test_vmull_high_lane(int32x4_t a, int32x2_t b) {
+  vmull_high_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vmull_high_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vmull_high_lane_s32(a, b, 0);
+  vmull_high_lane_s32(a, b, 1);
+}
+
+void test_vmull_high_laneq(int32x4_t a, int32x4_t b) {
+  vmull_high_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vmull_high_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vmull_high_laneq_s32(a, b, 0);
+  vmull_high_laneq_s32(a, b, 3);
+}
+
+void test_vqdmlal_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) {
+  vqdmlal_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqdmlal_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqdmlal_high_lane_s32(a, b, v, 0);
+  vqdmlal_high_lane_s32(a, b, v, 1);
+}
+
+void test_vqdmlal_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) {
+  vqdmlal_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqdmlal_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqdmlal_high_laneq_s32(a, b, v, 0);
+  vqdmlal_high_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vqdmlal_lane(int64x2_t a, int32x2_t b, int32x2_t v) {
+  vqdmlal_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqdmlal_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqdmlal_lane_s32(a, b, v, 0);
+  vqdmlal_lane_s32(a, b, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vqdmlal_laneq(int64x2_t a, int32x2_t b, int32x4_t v) {
+  vqdmlal_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqdmlal_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqdmlal_laneq_s32(a, b, v, 0);
+  vqdmlal_laneq_s32(a, b, v, 3);
+}
+
+void test_vqdmlsl_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) {
+  vqdmlsl_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqdmlsl_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqdmlsl_high_lane_s32(a, b, v, 0);
+  vqdmlsl_high_lane_s32(a, b, v, 1);
+}
+
+void test_vqdmlsl_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) {
+  vqdmlsl_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqdmlsl_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqdmlsl_high_laneq_s32(a, b, v, 0);
+  vqdmlsl_high_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vqdmlsl_lane(int64x2_t a, int32x2_t b, int32x2_t v) {
+  vqdmlsl_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqdmlsl_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqdmlsl_lane_s32(a, b, v, 0);
+  vqdmlsl_lane_s32(a, b, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vqdmlsl_laneq(int64x2_t a, int32x2_t b, int32x4_t v) {
+  vqdmlsl_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqdmlsl_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqdmlsl_laneq_s32(a, b, v, 0);
+  vqdmlsl_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vqdmulh_lane(int32x2_t a, int32x2_t b) {
+  vqdmulh_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqdmulh_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqdmulh_lane_s32(a, b, 0);
+  vqdmulh_lane_s32(a, b, 1);
+}
+
+void test_vqdmulhq_lane(int32x4_t a, int32x2_t b) {
+  vqdmulhq_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqdmulhq_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqdmulhq_lane_s32(a, b, 0);
+  vqdmulhq_lane_s32(a, b, 1);
+}
+
+#if defined(__aarch64__)
+void test_vqdmulh_laneq(int32x2_t a, int32x4_t b) {
+  vqdmulh_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqdmulh_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqdmulh_laneq_s32(a, b, 0);
+  vqdmulh_laneq_s32(a, b, 3);
+}
+
+void test_vqdmulhq_laneq(int32x4_t a, int32x4_t b) {
+  vqdmulhq_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqdmulhq_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqdmulhq_laneq_s32(a, b, 0);
+  vqdmulhq_laneq_s32(a, b, 3);
+}
+
+void test_vqdmull_high_lane(int32x4_t a, int32x2_t b) {
+  vqdmull_high_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqdmull_high_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqdmull_high_lane_s32(a, b, 0);
+  vqdmull_high_lane_s32(a, b, 1);
+}
+
+void test_vqdmull_high_laneq(int32x4_t a, int32x4_t b) {
+  vqdmull_high_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqdmull_high_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqdmull_high_laneq_s32(a, b, 0);
+  vqdmull_high_laneq_s32(a, b, 3);
+}
+#endif
+
+void test_vqdmull_lane(int32x2_t a, int32x2_t v) {
+  vqdmull_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqdmull_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqdmull_lane_s32(a, v, 0);
+  vqdmull_lane_s32(a, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vqdmull_laneq(int32x2_t a, int32x4_t v) {
+  vqdmull_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqdmull_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqdmull_laneq_s32(a, v, 0);
+  vqdmull_laneq_s32(a, v, 3);
+}
+#endif
+
+void test_vqrdmlah_lane(int32x2_t a, int32x2_t b, int32x2_t v) {
+  vqrdmlah_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqrdmlah_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqrdmlah_lane_s32(a, b, v, 0);
+  vqrdmlah_lane_s32(a, b, v, 1);
+}
+
+void test_vqrdmlahq_lane(int32x4_t a, int32x4_t b, int32x2_t v) {
+  vqrdmlahq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqrdmlahq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqrdmlahq_lane_s32(a, b, v, 0);
+  vqrdmlahq_lane_s32(a, b, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vqrdmlah_laneq(int32x2_t a, int32x2_t b, int32x4_t v) {
+  vqrdmlah_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqrdmlah_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqrdmlah_laneq_s32(a, b, v, 0);
+  vqrdmlah_laneq_s32(a, b, v, 3);
+}
+
+void test_vqrdmlahq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) {
+  vqrdmlahq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqrdmlahq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqrdmlahq_laneq_s32(a, b, v, 0);
+  vqrdmlahq_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vqrdmlsh_lane(int32x2_t a, int32x2_t b, int32x2_t v) {
+  vqrdmlsh_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqrdmlsh_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqrdmlsh_lane_s32(a, b, v, 0);
+  vqrdmlsh_lane_s32(a, b, v, 1);
+}
+
+void test_vqrdmlshq_lane(int32x4_t a, int32x4_t b, int32x2_t v) {
+  vqrdmlshq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqrdmlshq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqrdmlshq_lane_s32(a, b, v, 0);
+  vqrdmlshq_lane_s32(a, b, v, 1);
+}
+
+#if defined(__aarch64__)
+void test_vqrdmlsh_laneq(int32x2_t a, int32x2_t b, int32x4_t v) {
+  vqrdmlsh_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqrdmlsh_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqrdmlsh_laneq_s32(a, b, v, 0);
+  vqrdmlsh_laneq_s32(a, b, v, 3);
+}
+
+void test_vqrdmlshq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) {
+  vqrdmlshq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqrdmlshq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqrdmlshq_laneq_s32(a, b, v, 0);
+  vqrdmlshq_laneq_s32(a, b, v, 3);
+}
+#endif
+
+void test_vqrdmulh_lane(int32x2_t a, int32x2_t v) {
+  vqrdmulh_lane_s32(a, v,  -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqrdmulh_lane_s32(a, v,  2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqrdmulh_lane_s32(a, v,  0);
+  vqrdmulh_lane_s32(a, v,  1);
+}
+
+void test_vqrdmulhq_lane(int32x4_t a, int32x2_t v) {
+  vqrdmulhq_lane_s32(a, v,  -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  vqrdmulhq_lane_s32(a, v,  2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  vqrdmulhq_lane_s32(a, v,  0);
+  vqrdmulhq_lane_s32(a, v,  1);
+}
+
+#if defined(__aarch64__)
+void test_vqrdmulh_laneq(int32x2_t a, int32x4_t v) {
+  vqrdmulh_laneq_s32(a, v,  -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqrdmulh_laneq_s32(a, v,  4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqrdmulh_laneq_s32(a, v,  0);
+  vqrdmulh_laneq_s32(a, v,  3);
+}
+
+void test_vqrdmulhq_laneq(int32x4_t a, int32x4_t v) {
+  vqrdmulhq_laneq_s32(a, v,  -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  vqrdmulhq_laneq_s32(a, v,  4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  vqrdmulhq_laneq_s32(a, v,  0);
+  vqrdmulhq_laneq_s32(a, v,  3);
+}
+#endif

diff  --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
index 4b48ba01c4bc..a0896c7aa4f2 100644
--- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
@@ -773,19 +773,23 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
 }
 
 // CHECK-LABEL: test_vmul_lane_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]]  = fmul <4 x half> %a, [[TMP0]]
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
 // CHECK: ret <4 x half> [[MUL]]
 float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) {
   return vmul_lane_f16(a, b, 3);
 }
 
 // CHECK-LABEL: test_vmulq_lane_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]]  = fmul <8 x half> %a, [[TMP0]]
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
 // CHECK: ret <8 x half> [[MUL]]
 float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) {
-  return vmulq_lane_f16(a, b, 7);
+  return vmulq_lane_f16(a, b, 3);
 }
 
 // CHECK-LABEL: test_vmul_n_f16
@@ -939,17 +943,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) {
 }
 
 // CHECK-LABEL: test_vdup_lane_f16
-// CHECK:   [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <4 x half> [[SHFL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x half> [[LANE]]
 float16x4_t test_vdup_lane_f16(float16x4_t a) {
   return vdup_lane_f16(a, 3);
 }
 
 // CHECK-LABEL: test_vdupq_lane_f16
-// CHECK:   [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK:   ret <8 x half> [[SHFL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x half> [[LANE]]
 float16x8_t test_vdupq_lane_f16(float16x4_t a) {
-  return vdupq_lane_f16(a, 7);
+  return vdupq_lane_f16(a, 3);
 }
 
 // CHECK-LABEL: @test_vext_f16(

diff  --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c
index 24c57c4f0de3..5c6f61e7acf9 100644
--- a/clang/test/CodeGen/arm64_vdupq_n_f64.c
+++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c
@@ -28,7 +28,9 @@ float32x4_t test_vdupq_n_f32(float32_t w) {
 // this was in <rdar://problem/11778405>, but had already been implemented,
 // test anyway
 // CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 {
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %V to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK:   ret <2 x double> [[SHUFFLE]]
 float64x2_t test_vdupq_lane_f64(float64x1_t V) {
     return vdupq_lane_f64(V, 0);

diff  --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c
index 9f1a64554155..2cdbfb9ba26b 100644
--- a/clang/test/CodeGen/arm_neon_intrinsics.c
+++ b/clang/test/CodeGen/arm_neon_intrinsics.c
@@ -2419,15 +2419,19 @@ uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
 }
 
 // CHECK-LABEL: @test_vdup_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <4 x i16> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[LANE]]
 uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
   return vdup_lane_u16(a, 3);
 }
 
 // CHECK-LABEL: @test_vdup_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
-// CHECK:   ret <2 x i32> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[LANE]]
 uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
   return vdup_lane_u32(a, 1);
 }
@@ -2440,15 +2444,19 @@ int8x8_t test_vdup_lane_s8(int8x8_t a) {
 }
 
 // CHECK-LABEL: @test_vdup_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <4 x i16> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[LANE]]
 int16x4_t test_vdup_lane_s16(int16x4_t a) {
   return vdup_lane_s16(a, 3);
 }
 
 // CHECK-LABEL: @test_vdup_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
-// CHECK:   ret <2 x i32> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[LANE]]
 int32x2_t test_vdup_lane_s32(int32x2_t a) {
   return vdup_lane_s32(a, 1);
 }
@@ -2461,15 +2469,19 @@ poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
 }
 
 // CHECK-LABEL: @test_vdup_lane_p16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <4 x i16> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[LANE]]
 poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
   return vdup_lane_p16(a, 3);
 }
 
 // CHECK-LABEL: @test_vdup_lane_f32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
-// CHECK:   ret <2 x float> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x float> [[LANE]]
 float32x2_t test_vdup_lane_f32(float32x2_t a) {
   return vdup_lane_f32(a, 1);
 }
@@ -2482,15 +2494,19 @@ uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
 }
 
 // CHECK-LABEL: @test_vdupq_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <8 x i16> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[LANE]]
 uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
   return vdupq_lane_u16(a, 3);
 }
 
 // CHECK-LABEL: @test_vdupq_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   ret <4 x i32> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[LANE]]
 uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
   return vdupq_lane_u32(a, 1);
 }
@@ -2503,15 +2519,19 @@ int8x16_t test_vdupq_lane_s8(int8x8_t a) {
 }
 
 // CHECK-LABEL: @test_vdupq_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <8 x i16> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[LANE]]
 int16x8_t test_vdupq_lane_s16(int16x4_t a) {
   return vdupq_lane_s16(a, 3);
 }
 
 // CHECK-LABEL: @test_vdupq_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   ret <4 x i32> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[LANE]]
 int32x4_t test_vdupq_lane_s32(int32x2_t a) {
   return vdupq_lane_s32(a, 1);
 }
@@ -2524,43 +2544,55 @@ poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
 }
 
 // CHECK-LABEL: @test_vdupq_lane_p16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <8 x i16> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[LANE]]
 poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
   return vdupq_lane_p16(a, 3);
 }
 
 // CHECK-LABEL: @test_vdupq_lane_f32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   ret <4 x float> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x float> [[LANE]]
 float32x4_t test_vdupq_lane_f32(float32x2_t a) {
   return vdupq_lane_f32(a, 1);
 }
 
 // CHECK-LABEL: @test_vdup_lane_s64(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
-// CHECK:   ret <1 x i64> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 int64x1_t test_vdup_lane_s64(int64x1_t a) {
   return vdup_lane_s64(a, 0);
 }
 
 // CHECK-LABEL: @test_vdup_lane_u64(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
-// CHECK:   ret <1 x i64> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
   return vdup_lane_u64(a, 0);
 }
 
 // CHECK-LABEL: @test_vdupq_lane_s64(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
-// CHECK:   ret <2 x i64> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 int64x2_t test_vdupq_lane_s64(int64x1_t a) {
   return vdupq_lane_s64(a, 0);
 }
 
 // CHECK-LABEL: @test_vdupq_lane_u64(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
-// CHECK:   ret <2 x i64> [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
   return vdupq_lane_u64(a, 0);
 }
@@ -7077,44 +7109,52 @@ uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
 }
 
 // CHECK-LABEL: @test_vmlal_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
-// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8
+// CHECK:   [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlal_lane_s16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmlal_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
-// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8
+// CHECK:   [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlal_lane_s32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmlal_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
-// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8
+// CHECK:   [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlal_lane_u16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmlal_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
-// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8
+// CHECK:   [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK:   ret <2 x i64> [[ADD]]
 uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlal_lane_u32(a, b, c, 1);
@@ -7173,90 +7213,110 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
 }
 
 // CHECK-LABEL: @test_vmla_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmla_lane_s16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmla_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmla_lane_s32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmla_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmla_lane_u16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmla_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmla_lane_u32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmla_lane_f32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
 // CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmla_lane_f32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmlaq_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   return vmlaq_lane_s16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmlaq_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   return vmlaq_lane_s32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmlaq_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   return vmlaq_lane_u16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmlaq_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   return vmlaq_lane_u32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmlaq_lane_f32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   return vmlaq_lane_f32(a, b, c, 1);
@@ -7553,44 +7613,52 @@ uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
 }
 
 // CHECK-LABEL: @test_vmlsl_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
-// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlsl_lane_s16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmlsl_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
-// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlsl_lane_s32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmlsl_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
-// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlsl_lane_u16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmlsl_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
-// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK:   ret <2 x i64> [[SUB]]
 uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlsl_lane_u32(a, b, c, 1);
@@ -7649,90 +7717,110 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
 }
 
 // CHECK-LABEL: @test_vmls_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmls_lane_s16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmls_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmls_lane_s32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmls_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmls_lane_u16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmls_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmls_lane_u32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmls_lane_f32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
 // CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmls_lane_f32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmlsq_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   return vmlsq_lane_s16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmlsq_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   return vmlsq_lane_s32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmlsq_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   return vmlsq_lane_u16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vmlsq_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   return vmlsq_lane_u32(a, b, c, 1);
 }
 
 // CHECK-LABEL: @test_vmlsq_lane_f32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
-// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
 // CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   return vmlsq_lane_f32(a, b, c, 1);
@@ -8404,40 +8492,48 @@ poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
 }
 
 // CHECK-LABEL: @test_vmull_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8
 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vmull_lane_s16(a, b, 3);
 }
 
 // CHECK-LABEL: @test_vmull_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8
 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vmull_lane_s32(a, b, 1);
 }
 
 // CHECK-LABEL: @test_vmull_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8
 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_lane_u16(a, b, 3);
 }
 
 // CHECK-LABEL: @test_vmull_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8
 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_lane_u32(a, b, 1);
@@ -8506,80 +8602,100 @@ poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
 }
 
 // CHECK-LABEL: @test_vmul_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
   return vmul_lane_s16(a, b, 3);
 }
 
 // CHECK-LABEL: @test_vmul_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
   return vmul_lane_s32(a, b, 1);
 }
 
 // CHECK-LABEL: @test_vmul_lane_f32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
   return vmul_lane_f32(a, b, 1);
 }
 
 // CHECK-LABEL: @test_vmul_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmul_lane_u16(a, b, 3);
 }
 
 // CHECK-LABEL: @test_vmul_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmul_lane_u32(a, b, 1);
 }
 
 // CHECK-LABEL: @test_vmulq_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
   return vmulq_lane_s16(a, b, 3);
 }
 
 // CHECK-LABEL: @test_vmulq_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
   return vmulq_lane_s32(a, b, 1);
 }
 
 // CHECK-LABEL: @test_vmulq_lane_f32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
   return vmulq_lane_f32(a, b, 1);
 }
 
 // CHECK-LABEL: @test_vmulq_lane_u16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
   return vmulq_lane_u16(a, b, 3);
 }
 
 // CHECK-LABEL: @test_vmulq_lane_u32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
   return vmulq_lane_u32(a, b, 1);
@@ -9700,24 +9816,28 @@ int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
 }
 
 // CHECK-LABEL: @test_vqdmlal_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
-// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #8
 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_lane_s16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vqdmlal_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
-// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #8
 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_lane_s32(a, b, c, 1);
@@ -9774,24 +9894,28 @@ int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
 }
 
 // CHECK-LABEL: @test_vqdmlsl_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
-// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #8
 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_lane_s16(a, b, c, 3);
 }
 
 // CHECK-LABEL: @test_vqdmlsl_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
-// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #8
 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_lane_s32(a, b, c, 1);
@@ -9866,10 +9990,12 @@ int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
 }
 
 // CHECK-LABEL: @test_vqdmulh_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8
 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
@@ -9877,10 +10003,12 @@ int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vqdmulh_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8
 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
@@ -9888,10 +10016,12 @@ int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
 }
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]]) #8
 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
@@ -9899,10 +10029,12 @@ int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]]) #8
 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
@@ -9988,10 +10120,12 @@ int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
 }
 
 // CHECK-LABEL: @test_vqdmull_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8
 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
@@ -9999,10 +10133,12 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vqdmull_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8
 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
@@ -10204,10 +10340,12 @@ int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
 }
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8
 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
@@ -10215,10 +10353,12 @@ int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8
 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
@@ -10226,10 +10366,12 @@ int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
 }
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]]) #8
 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
@@ -10237,10 +10379,12 @@ int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
-// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]]) #8
 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {

diff  --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 59ea15493f03..9166e7a718ec 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -27,8 +27,9 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/None.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
@@ -238,6 +239,11 @@ class Type {
     NumVectors = 1;
   }
 
+  void make32BitElement() {
+    assert_with_loc(Bitwidth > 32, "Not enough bits to make it 32!");
+    ElementBitwidth = 32;
+  }
+
   void doubleLanes() {
     assert_with_loc(Bitwidth != 128, "Can't get bigger than 128!");
     Bitwidth = 128;
@@ -518,7 +524,8 @@ class Intrinsic {
     std::pair<Type, std::string> emitDagDupTyped(DagInit *DI);
     std::pair<Type, std::string> emitDagShuffle(DagInit *DI);
     std::pair<Type, std::string> emitDagCast(DagInit *DI, bool IsBitCast);
-    std::pair<Type, std::string> emitDagCall(DagInit *DI);
+    std::pair<Type, std::string> emitDagCall(DagInit *DI,
+                                             bool MatchMangledName);
     std::pair<Type, std::string> emitDagNameReplace(DagInit *DI);
     std::pair<Type, std::string> emitDagLiteral(DagInit *DI);
     std::pair<Type, std::string> emitDagOp(DagInit *DI);
@@ -546,7 +553,8 @@ class NeonEmitter {
 public:
   /// Called by Intrinsic - this attempts to get an intrinsic that takes
   /// the given types as arguments.
-  Intrinsic &getIntrinsic(StringRef Name, ArrayRef<Type> Types);
+  Intrinsic &getIntrinsic(StringRef Name, ArrayRef<Type> Types,
+                          Optional<std::string> MangledName);
 
   /// Called by Intrinsic - returns a globally-unique number.
   unsigned getUniqueNumber() { return UniqueNumber++; }
@@ -1383,8 +1391,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDag(DagInit *DI) {
     return emitDagSaveTemp(DI);
   if (Op == "op")
     return emitDagOp(DI);
-  if (Op == "call")
-    return emitDagCall(DI);
+  if (Op == "call" || Op == "call_mangled")
+    return emitDagCall(DI, Op == "call_mangled");
   if (Op == "name_replace")
     return emitDagNameReplace(DI);
   if (Op == "literal")
@@ -1411,7 +1419,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagOp(DagInit *DI) {
   }
 }
 
-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCall(DagInit *DI) {
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) {
   std::vector<Type> Types;
   std::vector<std::string> Values;
   for (unsigned I = 0; I < DI->getNumArgs() - 1; ++I) {
@@ -1427,7 +1436,13 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCall(DagInit *DI) {
     N = SI->getAsUnquotedString();
   else
     N = emitDagArg(DI->getArg(0), "").second;
-  Intrinsic &Callee = Intr.Emitter.getIntrinsic(N, Types);
+  Optional<std::string> MangledName;
+  if (MatchMangledName) {
+    if (Intr.getRecord()->getValueAsBit("isLaneQ"))
+      N += "q";
+    MangledName = Intr.mangleName(N, ClassS);
+  }
+  Intrinsic &Callee = Intr.Emitter.getIntrinsic(N, Types, MangledName);
 
   // Make sure the callee is known as an early def.
   Callee.setNeededEarly();
@@ -1486,6 +1501,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCast(DagInit *DI,
         castToType.doubleLanes();
       } else if (SI->getAsUnquotedString() == "8") {
         castToType.makeInteger(8, true);
+      } else if (SI->getAsUnquotedString() == "32") {
+        castToType.make32BitElement();
       } else {
         castToType = Type::fromTypedefName(SI->getAsUnquotedString());
         assert_with_loc(!castToType.isVoid(), "Unknown typedef");
@@ -1832,7 +1849,8 @@ void Intrinsic::indexBody() {
 // NeonEmitter implementation
 //===----------------------------------------------------------------------===//
 
-Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef<Type> Types) {
+Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef<Type> Types,
+                                     Optional<std::string> MangledName) {
   // First, look up the name in the intrinsic map.
   assert_with_loc(IntrinsicMap.find(Name.str()) != IntrinsicMap.end(),
                   ("Intrinsic '" + Name + "' not found!").str());
@@ -1861,17 +1879,19 @@ Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef<Type> Types) {
     }
     ErrMsg += ")\n";
 
+    if (MangledName && MangledName != I.getMangledName(true))
+      continue;
+
     if (I.getNumParams() != Types.size())
       continue;
 
-    bool Good = true;
-    for (unsigned Arg = 0; Arg < Types.size(); ++Arg) {
-      if (I.getParamType(Arg) != Types[Arg]) {
-        Good = false;
-        break;
-      }
-    }
-    if (Good)
+    unsigned ArgNum = 0;
+    bool MatchingArgumentTypes =
+        std::all_of(Types.begin(), Types.end(), [&](const auto &Type) {
+          return Type == I.getParamType(ArgNum++);
+        });
+
+    if (MatchingArgumentTypes)
       GoodVec.push_back(&I);
   }