[llvm] 2939fc1 - [AArch64] Add IR intrinsics for sq(r)dmulh_lane(q)

Sanne Wouda via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 29 05:34:10 PST 2020


Author: Sanne Wouda
Date: 2020-01-29T13:25:23Z
New Revision: 2939fc13c8f6a5dbd1be77c1d19dc2720253b8c5

URL: https://github.com/llvm/llvm-project/commit/2939fc13c8f6a5dbd1be77c1d19dc2720253b8c5
DIFF: https://github.com/llvm/llvm-project/commit/2939fc13c8f6a5dbd1be77c1d19dc2720253b8c5.diff

LOG: [AArch64] Add IR intrinsics for sq(r)dmulh_lane(q)

Summary:
Currently, sqdmulh_lane and friends from the ACLE (implemented in arm_neon.h)
are represented in LLVM IR as a (by-vector) sqdmulh and a shufflevector with a
(repeated) lane index, like so:

   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)

When %v's values are known, the shufflevector is optimized away and we are no
longer able to select the lane variant of sqdmulh in the backend.
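
For example, once %v above is a known constant, the shufflevector folds into a
splat constant operand and the lane information disappears (the values here are
purely illustrative):

   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> <i16 5, i16 5, i16 5, i16 5>)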

This defeats a (hand-coded) optimization that packs several constants into a
single vector and uses the lane intrinsics to reduce register pressure, trading
the materialisation of several constants for a single vector load from the
constant pool, like so:

   int16x8_t v = {2,3,4,5,6,7,8,9};
   a = vqdmulh_laneq_s16(a, v, 0);
   b = vqdmulh_laneq_s16(b, v, 1);
   c = vqdmulh_laneq_s16(c, v, 2);
   d = vqdmulh_laneq_s16(d, v, 3);
   [...]

In one microbenchmark from libjpeg-turbo this accounts for a 2.5% to 4%
performance difference.

We could teach the compiler to recover the lane variants, but this would likely
require its own pass.  (Alternatively, "volatile" could be used on the constants
vector, but this is a bit ugly.)

This patch instead implements the following LLVM IR intrinsics for AArch64 to
maintain the original structure through IR optimization and into instruction
selection:
- sqdmulh_lane
- sqdmulh_laneq
- sqrdmulh_lane
- sqrdmulh_laneq
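
For example, with this change the vqdmulh_lane_s16 builtin keeps its lane index
as an explicit operand of the intrinsic call (a simplified form of the updated
clang tests below; the value names are abridged):

   %r = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)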

These 'lane' variants need an additional register class.  The by-element
encoding has only four bits for the second source register when operating on
i16 elements, so that argument must be in the lower half (D0-D15) of the 64-bit
NEON register file.

Note that the existing patterns for shufflevector and sqdmulh into sqdmulh_lane
(etc.) remain, so code that does not rely on NEON intrinsics to generate these
instructions is not affected.

This patch also changes clang to emit these IR intrinsics for the corresponding
NEON intrinsics (AArch64 only).

Reviewers: SjoerdMeijer, dmgreen, t.p.northover, rovka, rengolin, efriedma

Reviewed By: efriedma

Subscribers: kristof.beyls, hiraditya, jdoerfert, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D71469

Added: 
    

Modified: 
    clang/include/clang/Basic/arm_neon.td
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/test/CodeGen/aarch64-neon-2velem.c
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64InstrFormats.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
    llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
    llvm/lib/Target/AArch64/AArch64RegisterInfo.td
    llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
    llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index a4dc21b64311..380a2a0a5fa6 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -528,9 +528,16 @@ def VMULL_LANE    : SOpInst<"vmull_lane", "(>Q)..I", "siUsUi", OP_MULL_LN>;
 def VQDMULL_N     : SOpInst<"vqdmull_n", "(>Q).1", "si", OP_QDMULL_N>;
 def VQDMULL_LANE  : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>;
 def VQDMULH_N     : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>;
-def VQDMULH_LANE  : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
 def VQRDMULH_N    : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>;
+
+let ArchGuard = "!defined(__aarch64__)" in {
+def VQDMULH_LANE  : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
 def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
+}
+let ArchGuard = "defined(__aarch64__)" in {
+def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..qI", "siQsQi">;
+def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">;
+}
 
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
 def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
@@ -951,9 +958,10 @@ def VQDMULL_HIGH_LANE   : SOpInst<"vqdmull_high_lane", "(>Q)Q.I", "si",
 def VQDMULL_HIGH_LANEQ  : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si",
                                   OP_QDMULLHi_LN>;
 
-def VQDMULH_LANEQ  : SOpInst<"vqdmulh_laneq", "..QI", "siQsQi", OP_QDMULH_LN>;
-def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "..QI", "siQsQi", OP_QRDMULH_LN>;
-
+let isLaneQ = 1 in {
+def VQDMULH_LANEQ  : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
+def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
+}
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
 def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>;
 def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>;

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a48f6d2215b3..d6640e5d1bdc 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4969,14 +4969,22 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
   NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
   NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
   NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
+  NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
+  NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
   NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
+  NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
+  NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
   NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
   NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
   NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
   NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
   NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
   NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
+  NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+  NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
   NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
+  NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+  NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
   NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
   NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
   NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
@@ -5754,6 +5762,24 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
     Ops.resize(2);
     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
   }
+  case NEON::BI__builtin_neon_vqdmulhq_lane_v:
+  case NEON::BI__builtin_neon_vqdmulh_lane_v:
+  case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
+  case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
+    llvm::Type *Tys[2] = {
+        Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                            /*isQuad*/ false))};
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
+  }
+  case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
+  case NEON::BI__builtin_neon_vqdmulh_laneq_v:
+  case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
+  case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
+    llvm::Type *Tys[2] = {
+        Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                            /*isQuad*/ true))};
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
+  }
   case NEON::BI__builtin_neon_vqshl_n_v:
   case NEON::BI__builtin_neon_vqshlq_n_v:
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",

diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c
index 37cdb16eb0e7..5ad06cfff808 100644
--- a/clang/test/CodeGen/aarch64-neon-2velem.c
+++ b/clang/test/CodeGen/aarch64-neon-2velem.c
@@ -1440,12 +1440,12 @@ int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulh_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANE_V2]]
 //
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   return vqdmulh_lane_s16(a, v, 3);
@@ -1453,12 +1453,12 @@ int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANE_V2]]
 //
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   return vqdmulhq_lane_s16(a, v, 3);
@@ -1466,12 +1466,12 @@ int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulh_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANE_V2]]
 //
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   return vqdmulh_lane_s32(a, v, 1);
@@ -1479,12 +1479,12 @@ int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   return vqdmulhq_lane_s32(a, v, 1);
@@ -1492,12 +1492,12 @@ int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANE_V2]]
 //
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   return vqrdmulh_lane_s16(a, v, 3);
@@ -1505,12 +1505,12 @@ int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
 //
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   return vqrdmulhq_lane_s16(a, v, 3);
@@ -1518,12 +1518,12 @@ int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANE_V2]]
 //
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   return vqrdmulh_lane_s32(a, v, 1);
@@ -1531,12 +1531,12 @@ int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   return vqrdmulhq_lane_s32(a, v, 1);
@@ -3066,12 +3066,12 @@ int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANE_V2]]
 //
 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
   return vqdmulh_lane_s16(a, v, 0);
@@ -3079,12 +3079,12 @@ int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANE_V2]]
 //
 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
   return vqdmulhq_lane_s16(a, v, 0);
@@ -3092,12 +3092,12 @@ int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANE_V2]]
 //
 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
   return vqdmulh_lane_s32(a, v, 0);
@@ -3105,12 +3105,12 @@ int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
   return vqdmulhq_lane_s32(a, v, 0);
@@ -3118,12 +3118,12 @@ int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANE_V2]]
 //
 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
   return vqrdmulh_lane_s16(a, v, 0);
@@ -3131,12 +3131,12 @@ int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
 //
 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
   return vqrdmulhq_lane_s16(a, v, 0);
@@ -3144,12 +3144,12 @@ int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANE_V2]]
 //
 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
   return vqrdmulh_lane_s32(a, v, 0);
@@ -3157,12 +3157,12 @@ int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
   return vqrdmulhq_lane_s32(a, v, 0);
@@ -4753,12 +4753,12 @@ int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANEQ_V2]]
 //
 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
   return vqdmulh_laneq_s16(a, v, 0);
@@ -4766,12 +4766,12 @@ int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
 //
 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   return vqdmulhq_laneq_s16(a, v, 0);
@@ -4779,12 +4779,12 @@ int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANEQ_V2]]
 //
 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
   return vqdmulh_laneq_s32(a, v, 0);
@@ -4792,12 +4792,12 @@ int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
 //
 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   return vqdmulhq_laneq_s32(a, v, 0);
@@ -4805,12 +4805,12 @@ int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
 //
 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
   return vqrdmulh_laneq_s16(a, v, 0);
@@ -4818,12 +4818,12 @@ int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
 //
 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   return vqrdmulhq_laneq_s16(a, v, 0);
@@ -4831,12 +4831,12 @@ int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
 //
 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
   return vqrdmulh_laneq_s32(a, v, 0);
@@ -4844,12 +4844,12 @@ int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
 //
 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   return vqrdmulhq_laneq_s32(a, v, 0);
@@ -5149,12 +5149,12 @@ int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANEQ_V2]]
 //
 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
   return vqdmulh_laneq_s16(a, v, 7);
@@ -5162,12 +5162,12 @@ int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
 //
 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
   return vqdmulhq_laneq_s16(a, v, 7);
@@ -5175,12 +5175,12 @@ int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANEQ_V2]]
 //
 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
   return vqdmulh_laneq_s32(a, v, 3);
@@ -5188,12 +5188,12 @@ int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
 //
 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
   return vqdmulhq_laneq_s32(a, v, 3);
@@ -5201,12 +5201,12 @@ int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
 //
 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
   return vqrdmulh_laneq_s16(a, v, 7);
@@ -5214,12 +5214,12 @@ int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
 //
 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
   return vqrdmulhq_laneq_s16(a, v, 7);
@@ -5227,12 +5227,12 @@ int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
 
 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
 //
 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
   return vqrdmulh_laneq_s32(a, v, 3);
@@ -5240,12 +5240,12 @@ int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
 
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
 //
 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
   return vqrdmulhq_laneq_s32(a, v, 3);

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 8c550b7c8593..6fbcfe8b6690 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -133,6 +133,10 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
                 [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Lane_Intrinsic
+    : Intrinsic<[llvm_anyint_ty],
+                [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
+                [IntrNoMem]>;
 
   class AdvSIMD_3VectorArg_Intrinsic
       : Intrinsic<[llvm_anyvector_ty],
@@ -207,9 +211,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
 
   // Vector Saturating Doubling Multiply High
   def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
+  def int_aarch64_neon_sqdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
+  def int_aarch64_neon_sqdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
 
   // Vector Saturating Rounding Doubling Multiply High
   def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
+  def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
+  def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
 
   // Vector Polynominal Multiply
   def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index d91ef35afecd..db27a53963da 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -360,6 +360,9 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
 def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
 def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
 
+def UImmS1XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
 def UImmS2XForm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
 }]>;
@@ -7968,6 +7971,64 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
   }
 }
 
+multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
+                                 SDPatternOperator OpNodeLaneQ> {
+
+  def : Pat<(v4i16 (OpNodeLane
+                     (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i16 (OpNodeLaneQ
+                     (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
+                     VectorIndexH32b:$idx)),
+            (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v8i16 (OpNodeLane
+                     (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), $Rm, dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v8i16 (OpNodeLaneQ
+                     (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
+                     VectorIndexH32b:$idx)),
+            (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v2i32 (OpNodeLane
+                     (v2i32 V64:$Rn), (v2i32 V64:$Rm),
+                     VectorIndexD32b:$idx)),
+            (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v2i32 (OpNodeLaneQ
+                     (v2i32 V64:$Rn), (v4i32 V128:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i32 (OpNodeLane
+                     (v4i32 V128:$Rn), (v2i32 V64:$Rm),
+                     VectorIndexD32b:$idx)),
+            (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), $Rm, dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i32 (OpNodeLaneQ
+                     (v4i32 V128:$Rn),
+                     (v4i32 V128:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+}
+
 multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
                          SDPatternOperator OpNode> {
   def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c2853da050f1..9cf1a5166fe9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5631,6 +5631,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
 defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
 
+defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
+                                     int_aarch64_neon_sqdmulh_laneq>;
+defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
+                                      int_aarch64_neon_sqrdmulh_laneq>;
+
 // Generated by MachineCombine
 defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
 defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;

diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 40efac261fd9..58308378401c 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -230,6 +230,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case AArch64::FPR16RegClassID:
   case AArch64::FPR32RegClassID:
   case AArch64::FPR64RegClassID:
+  case AArch64::FPR64_loRegClassID:
   case AArch64::FPR128RegClassID:
   case AArch64::FPR128_loRegClassID:
   case AArch64::DDRegClassID:

diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index cdfbc0f5f691..61834874a869 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -596,6 +596,7 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
     return 32;
 
   case AArch64::FPR128_loRegClassID:
+  case AArch64::FPR64_loRegClassID:
     return 16;
   }
 }

diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index f52feab03953..4d8939195b5b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -429,6 +429,10 @@ def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
 def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
                                     v1i64, v4f16],
                                     64, (sequence "D%u", 0, 31)>;
+def FPR64_lo : RegisterClass<"AArch64",
+                             [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
+                             64, (trunc FPR64, 16)>;
+
 // We don't (yet) have an f128 legal type, so don't use that here. We
 // normalize 128-bit vectors to v2f64 for arg passing and such, so use
 // that here.
@@ -503,6 +507,9 @@ def VectorRegLoAsmOperand : AsmOperandClass {
   let Name = "VectorRegLo";
   let PredicateMethod = "isNeonVectorRegLo";
 }
+def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
+  let ParserMatchClass = VectorRegLoAsmOperand;
+}
 def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
   let ParserMatchClass = VectorRegLoAsmOperand;
 }

diff  --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 5c7697a8bbcc..fb09cc247a1a 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1033,8 +1033,10 @@ class AArch64Operand : public MCParsedAsmOperand {
 
   bool isNeonVectorRegLo() const {
     return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
-           AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
-               Reg.RegNum);
+           (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+                Reg.RegNum) ||
+            AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
+                Reg.RegNum));
   }
 
   template <unsigned Class> bool isSVEVectorReg() const {

diff  --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index 26826789107e..eee0d77d98eb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -9,20 +9,36 @@ declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
 
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
 
 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
 
 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
 
 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
 
 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
 
 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
 
 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
 
 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
 
 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
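
Note the type mangling on the new declarations: the first vector type is the result (and first-operand) type, the second is the type of the vector supplying the lane, so the usable index range follows that second type. For instance, the laneq form indexes a full 128-bit source:

   declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
   ; the lane comes from an 8 x i16 vector, so the tests below use indices up to 7:
   %r = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)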
 
@@ -1515,6 +1531,37 @@ entry:
   ret <4 x i16> %vqdmulh2.i
 }
 
+define <4 x i16> @test_vqdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s16_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+  ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+  ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+  ret <4 x i16> %vqdmulh2.i
+}
+
 define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s16:
 ; CHECK:       // %bb.0: // %entry
@@ -1527,6 +1574,37 @@ entry:
   ret <8 x i16> %vqdmulh2.i
 }
 
+define <8 x i16> @test_vqdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s16_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+  ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+  ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+  ret <8 x i16> %vqdmulh2.i
+}
+
 define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -1539,6 +1617,37 @@ entry:
   ret <2 x i32> %vqdmulh2.i
 }
 
+define <2 x i32> @test_vqdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s32_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+  ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+  ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+  ret <2 x i32> %vqdmulh2.i
+}
+
 define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -1551,6 +1660,37 @@ entry:
   ret <4 x i32> %vqdmulh2.i
 }
 
+define <4 x i32> @test_vqdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s32_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+  ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+  ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+  ret <4 x i32> %vqdmulh2.i
+}
+
 define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s16:
 ; CHECK:       // %bb.0: // %entry
@@ -1563,6 +1703,37 @@ entry:
   ret <4 x i16> %vqrdmulh2.i
 }
 
+define <4 x i16> @test_vqrdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s16_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
 define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s16:
 ; CHECK:       // %bb.0: // %entry
@@ -1575,6 +1746,37 @@ entry:
   ret <8 x i16> %vqrdmulh2.i
 }
 
+define <8 x i16> @test_vqrdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s16_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
 define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -1587,6 +1789,37 @@ entry:
   ret <2 x i32> %vqrdmulh2.i
 }
 
+define <2 x i32> @test_vqrdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s32_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
 define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -1599,6 +1832,37 @@ entry:
   ret <4 x i32> %vqrdmulh2.i
 }
 
+define <4 x i32> @test_vqrdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s32_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
 define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
 ; CHECK-LABEL: test_vmul_lane_f32:
 ; CHECK:       // %bb.0: // %entry


        

