r263048 - ARM & AArch64: convert asm tests to LLVM IR and restrict optimizations.

Tim Northover via cfe-commits cfe-commits at lists.llvm.org
Wed Mar 9 10:54:43 PST 2016


Author: tnorthover
Date: Wed Mar  9 12:54:42 2016
New Revision: 263048

URL: http://llvm.org/viewvc/llvm-project?rev=263048&view=rev
Log:
ARM & AArch64: convert asm tests to LLVM IR and restrict optimizations.

This is mostly a one-time autoconversion of tests that checked assembly after
"-Owhatever" compiles to only run "opt -mem2reg" and check the assembly. This
should make them much more stable to changes in LLVM so they won't break on
unrelated changes.

"opt -mem2reg" is a compromise designed to increase the readability of tests
that check dataflow, while minimizing dependency on LLVM. Hopefully mem2reg is
stable enough that no surpises will come along.

Should address http://llvm.org/PR26815.

Modified:
    cfe/trunk/test/CodeGen/aarch64-neon-2velem.c
    cfe/trunk/test/CodeGen/aarch64-neon-3v.c
    cfe/trunk/test/CodeGen/aarch64-neon-across.c
    cfe/trunk/test/CodeGen/aarch64-neon-extract.c
    cfe/trunk/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
    cfe/trunk/test/CodeGen/aarch64-neon-fma.c
    cfe/trunk/test/CodeGen/aarch64-neon-intrinsics.c
    cfe/trunk/test/CodeGen/aarch64-neon-ldst-one.c
    cfe/trunk/test/CodeGen/aarch64-neon-misc.c
    cfe/trunk/test/CodeGen/aarch64-neon-perm.c
    cfe/trunk/test/CodeGen/aarch64-neon-scalar-copy.c
    cfe/trunk/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
    cfe/trunk/test/CodeGen/aarch64-neon-shifts.c
    cfe/trunk/test/CodeGen/aarch64-neon-tbl.c
    cfe/trunk/test/CodeGen/aarch64-neon-vcombine.c
    cfe/trunk/test/CodeGen/aarch64-neon-vget-hilo.c
    cfe/trunk/test/CodeGen/aarch64-neon-vget.c
    cfe/trunk/test/CodeGen/aarch64-poly128.c
    cfe/trunk/test/CodeGen/aarch64-poly64.c
    cfe/trunk/test/CodeGen/arm-bitfield-alignment.c
    cfe/trunk/test/CodeGen/arm-crc32.c
    cfe/trunk/test/CodeGen/arm-neon-directed-rounding.c
    cfe/trunk/test/CodeGen/arm-neon-fma.c
    cfe/trunk/test/CodeGen/arm-neon-numeric-maxmin.c
    cfe/trunk/test/CodeGen/arm-neon-shifts.c
    cfe/trunk/test/CodeGen/arm-neon-vcvtX.c
    cfe/trunk/test/CodeGen/arm-neon-vget.c
    cfe/trunk/test/CodeGen/arm64-be-bitfield.c
    cfe/trunk/test/CodeGen/arm64-crc32.c
    cfe/trunk/test/CodeGen/arm64-lanes.c
    cfe/trunk/test/CodeGen/arm64_vcopy.c
    cfe/trunk/test/CodeGen/arm64_vcreate.c
    cfe/trunk/test/CodeGen/arm64_vdupq_n_f64.c
    cfe/trunk/test/CodeGen/arm_neon_intrinsics.c
    cfe/trunk/test/CodeGen/builtins-arm-exclusive.c
    cfe/trunk/test/CodeGen/builtins-arm.c
    cfe/trunk/test/CodeGen/builtins-arm64.c
    cfe/trunk/test/CodeGen/fp128_complex.c
    cfe/trunk/test/CodeGen/neon-immediate-ubsan.c

Modified: cfe/trunk/test/CodeGen/aarch64-neon-2velem.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-2velem.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-2velem.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-2velem.c Wed Mar  9 12:54:42 2016
@@ -1,2452 +1,5011 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_s16
   return vmla_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s16
   return vmlaq_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_s32
   return vmla_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s32
   return vmlaq_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s16
   return vmla_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s16
   return vmlaq_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s32
   return vmla_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s32
   return vmlaq_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_s16
   return vmls_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s16
   return vmlsq_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_s32
   return vmls_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s32
   return vmlsq_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s16
   return vmls_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s16
   return vmlsq_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s32
   return vmls_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s32
   return vmlsq_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_s16
   return vmul_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s16
   return vmulq_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_s32
   return vmul_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s32
   return vmulq_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_u16
   return vmul_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u16
   return vmulq_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_u32
   return vmul_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u32
   return vmulq_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s16
   return vmul_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s16
   return vmulq_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s32
   return vmul_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s32
   return vmulq_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u16
   return vmul_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u16
   return vmulq_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u32
   return vmul_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u32
   return vmulq_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1)
+// CHECK:   ret <2 x float> [[FMLA]]2
 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfma_lane_f32
   return vfma_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1)
+// CHECK:   ret <4 x float> [[FMLA]]2
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f32
   return vfmaq_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfma_laneq_f32
   return vfma_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f32
   return vfmaq_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1)
+// CHECK:   ret <2 x float> [[FMLA]]2
 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfms_lane_f32
   return vfms_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1)
+// CHECK:   ret <4 x float> [[FMLA]]2
 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f32
   return vfmsq_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfms_laneq_f32
   return vfms_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f32
   return vfmsq_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA]]1)
+// CHECK:   ret <2 x double> [[FMLA]]2
 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f64
   return vfmaq_lane_f64(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f64
   return vfmaq_laneq_f64(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA]]1)
+// CHECK:   ret <2 x double> [[FMLA]]2
 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f64
   return vfmsq_lane_f64(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f64
   return vfmsq_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmas_laneq_f32
   return vfmas_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsd_lane_f64
   return vfmsd_lane_f64(a, b, v, 0);
-  // CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+\.d\[0\]|fmsub d[0-9]+, d[0-9]+, d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmss_laneq_f32
   return vfmss_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsd_laneq_f64
   return vfmsd_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s16
   return vmlal_lane_s16(a, b, v, 3);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s32
   return vmlal_lane_s32(a, b, v, 1);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s16
   return vmlal_laneq_s16(a, b, v, 7);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s32
   return vmlal_laneq_s32(a, b, v, 3);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s16
   return vmlal_high_lane_s16(a, b, v, 3);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s32
   return vmlal_high_lane_s32(a, b, v, 1);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s16
   return vmlal_high_laneq_s16(a, b, v, 7);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s32
   return vmlal_high_laneq_s32(a, b, v, 3);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s16
   return vmlsl_lane_s16(a, b, v, 3);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s32
   return vmlsl_lane_s32(a, b, v, 1);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s16
   return vmlsl_laneq_s16(a, b, v, 7);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s32
   return vmlsl_laneq_s32(a, b, v, 3);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s16
   return vmlsl_high_lane_s16(a, b, v, 3);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s32
   return vmlsl_high_lane_s32(a, b, v, 1);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s16
   return vmlsl_high_laneq_s16(a, b, v, 7);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s32
   return vmlsl_high_laneq_s32(a, b, v, 3);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u16
   return vmlal_lane_u16(a, b, v, 3);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u32
   return vmlal_lane_u32(a, b, v, 1);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u16
   return vmlal_laneq_u16(a, b, v, 7);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u32
   return vmlal_laneq_u32(a, b, v, 3);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u16
   return vmlal_high_lane_u16(a, b, v, 3);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u32
   return vmlal_high_lane_u32(a, b, v, 1);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u16
   return vmlal_high_laneq_u16(a, b, v, 7);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u32
   return vmlal_high_laneq_u32(a, b, v, 3);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u16
   return vmlsl_lane_u16(a, b, v, 3);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u32
   return vmlsl_lane_u32(a, b, v, 1);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u16
   return vmlsl_laneq_u16(a, b, v, 7);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u32
   return vmlsl_laneq_u32(a, b, v, 3);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u16
   return vmlsl_high_lane_u16(a, b, v, 3);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u32
   return vmlsl_high_lane_u32(a, b, v, 1);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u16
   return vmlsl_high_laneq_u16(a, b, v, 7);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u32
   return vmlsl_high_laneq_u32(a, b, v, 3);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_s16
   return vmull_lane_s16(a, v, 3);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_s32
   return vmull_lane_s32(a, v, 1);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_u16
   return vmull_lane_u16(a, v, 3);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_u32
   return vmull_lane_u32(a, v, 1);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s16
   return vmull_high_lane_s16(a, v, 3);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s32
   return vmull_high_lane_s32(a, v, 1);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u16
   return vmull_high_lane_u16(a, v, 3);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u32
   return vmull_high_lane_u32(a, v, 1);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s16
   return vmull_laneq_s16(a, v, 7);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s32
   return vmull_laneq_s32(a, v, 3);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u16
   return vmull_laneq_u16(a, v, 7);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u32
   return vmull_laneq_u32(a, v, 3);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s16
   return vmull_high_laneq_s16(a, v, 7);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s32
   return vmull_high_laneq_s32(a, v, 3);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u16
   return vmull_high_laneq_u16(a, v, 7);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u32
   return vmull_high_laneq_u32(a, v, 3);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s16
   return vqdmlal_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s32
   return vqdmlal_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s16
   return vqdmlal_high_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s32
   return vqdmlal_high_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s16
   return vqdmlsl_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s32
   return vqdmlsl_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s16
   return vqdmlsl_high_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s32
   return vqdmlsl_high_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s16
   return vqdmull_lane_s16(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s32
   return vqdmull_lane_s32(a, v, 1);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s16
   return vqdmull_laneq_s16(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s32
   return vqdmull_laneq_s32(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s16
   return vqdmull_high_lane_s16(a, v, 3);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s32
   return vqdmull_high_lane_s32(a, v, 1);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s16
   return vqdmull_high_laneq_s16(a, v, 7);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s32
   return vqdmull_high_laneq_s32(a, v, 3);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s16
   return vqdmulh_lane_s16(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s16
   return vqdmulhq_lane_s16(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s32
   return vqdmulh_lane_s32(a, v, 1);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s32
   return vqdmulhq_lane_s32(a, v, 1);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s16
   return vqrdmulh_lane_s16(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s16
   return vqrdmulhq_lane_s16(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s32
   return vqrdmulh_lane_s32(a, v, 1);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s32
   return vqrdmulhq_lane_s32(a, v, 1);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_f32
   return vmul_lane_f32(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
 
+// CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmul_lane_f64
   return vmul_lane_f64(a, v, 0);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+\.d\[0\]|d[0-9]+}}
 }
 
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f32
   return vmulq_lane_f32(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f64
   return vmulq_lane_f64(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f32
   return vmul_laneq_f32(a, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f64
   return vmul_laneq_f64(a, v, 1);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f32
   return vmulq_laneq_f32(a, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f64
   return vmulq_laneq_f64(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulx_lane_f32
   return vmulx_lane_f32(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f32
   return vmulxq_lane_f32(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f64
   return vmulxq_lane_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulx_laneq_f32
   return vmulx_laneq_f32(a, v, 3);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f32
   return vmulxq_laneq_f32(a, v, 3);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f64
   return vmulxq_laneq_f64(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_s16_0
   return vmla_lane_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s16_0
   return vmlaq_lane_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_s32_0
   return vmla_lane_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s32_0
   return vmlaq_lane_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s16_0
   return vmla_laneq_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s16_0
   return vmlaq_laneq_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s32_0
   return vmla_laneq_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s32_0
   return vmlaq_laneq_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_s16_0
   return vmls_lane_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s16_0
   return vmlsq_lane_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_s32_0
   return vmls_lane_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s32_0
   return vmlsq_lane_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s16_0
   return vmls_laneq_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s16_0
   return vmlsq_laneq_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s32_0
   return vmls_laneq_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s32_0
   return vmlsq_laneq_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_s16_0
   return vmul_lane_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s16_0
   return vmulq_lane_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_s32_0
   return vmul_lane_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s32_0
   return vmulq_lane_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_u16_0
   return vmul_lane_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u16_0
   return vmulq_lane_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_u32_0
   return vmul_lane_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u32_0
   return vmulq_lane_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s16_0
   return vmul_laneq_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s16_0
   return vmulq_laneq_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s32_0
   return vmul_laneq_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s32_0
   return vmulq_laneq_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u16_0
   return vmul_laneq_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u16_0
   return vmulq_laneq_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u32_0
   return vmul_laneq_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u32_0
   return vmulq_laneq_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1)
+// CHECK:   ret <2 x float> [[FMLA]]2
 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfma_lane_f32_0
   return vfma_lane_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1)
+// CHECK:   ret <4 x float> [[FMLA]]2
 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f32_0
   return vfmaq_lane_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfma_laneq_f32_0
   return vfma_laneq_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f32_0
   return vfmaq_laneq_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1)
+// CHECK:   ret <2 x float> [[FMLA]]2
 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfms_lane_f32_0
   return vfms_lane_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1)
+// CHECK:   ret <4 x float> [[FMLA]]2
 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f32_0
   return vfmsq_lane_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfms_laneq_f32_0
   return vfms_laneq_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f32_0
   return vfmsq_laneq_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f64_0
   return vfmaq_laneq_f64(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f64_0
   return vfmsq_laneq_f64(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s16_0
   return vmlal_lane_s16(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s32_0
   return vmlal_lane_s32(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s16_0
   return vmlal_laneq_s16(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s32_0
   return vmlal_laneq_s32(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s16_0
   return vmlal_high_lane_s16(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s32_0
   return vmlal_high_lane_s32(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s16_0
   return vmlal_high_laneq_s16(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s32_0
   return vmlal_high_laneq_s32(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s16_0
   return vmlsl_lane_s16(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s32_0
   return vmlsl_lane_s32(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s16_0
   return vmlsl_laneq_s16(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s32_0
   return vmlsl_laneq_s32(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s16_0
   return vmlsl_high_lane_s16(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s32_0
   return vmlsl_high_lane_s32(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s16_0
   return vmlsl_high_laneq_s16(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s32_0
   return vmlsl_high_laneq_s32(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u16_0
   return vmlal_lane_u16(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u32_0
   return vmlal_lane_u32(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u16_0
   return vmlal_laneq_u16(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u32_0
   return vmlal_laneq_u32(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u16_0
   return vmlal_high_lane_u16(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u32_0
   return vmlal_high_lane_u32(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u16_0
   return vmlal_high_laneq_u16(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u32_0
   return vmlal_high_laneq_u32(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u16_0
   return vmlsl_lane_u16(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u32_0
   return vmlsl_lane_u32(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u16_0
   return vmlsl_laneq_u16(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u32_0
   return vmlsl_laneq_u32(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u16_0
   return vmlsl_high_lane_u16(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u32_0
   return vmlsl_high_lane_u32(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u16_0
   return vmlsl_high_laneq_u16(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u32_0
   return vmlsl_high_laneq_u32(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_s16_0
   return vmull_lane_s16(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_s32_0
   return vmull_lane_s32(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_u16_0
   return vmull_lane_u16(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_u32_0
   return vmull_lane_u32(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s16_0
   return vmull_high_lane_s16(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s32_0
   return vmull_high_lane_s32(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u16_0
   return vmull_high_lane_u16(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u32_0
   return vmull_high_lane_u32(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s16_0
   return vmull_laneq_s16(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s32_0
   return vmull_laneq_s32(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u16_0
   return vmull_laneq_u16(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u32_0
   return vmull_laneq_u32(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s16_0
   return vmull_high_laneq_s16(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s32_0
   return vmull_high_laneq_s32(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u16_0
   return vmull_high_laneq_u16(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u32_0
   return vmull_high_laneq_u32(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s16_0
   return vqdmlal_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s32_0
   return vqdmlal_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s16_0
   return vqdmlal_high_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s32_0
   return vqdmlal_high_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s16_0
   return vqdmlsl_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s32_0
   return vqdmlsl_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s16_0
   return vqdmlsl_high_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s32_0
   return vqdmlsl_high_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s16_0
   return vqdmull_lane_s16(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s32_0
   return vqdmull_lane_s32(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s16_0
   return vqdmull_laneq_s16(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s32_0
   return vqdmull_laneq_s32(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s16_0
   return vqdmull_high_lane_s16(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s32_0
   return vqdmull_high_lane_s32(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s16_0
   return vqdmull_high_laneq_s16(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s32_0
   return vqdmull_high_laneq_s32(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s16_0
   return vqdmulh_lane_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s16_0
   return vqdmulhq_lane_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s32_0
   return vqdmulh_lane_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s32_0
   return vqdmulhq_lane_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s16_0
   return vqrdmulh_lane_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s16_0
   return vqrdmulhq_lane_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s32_0
   return vqrdmulh_lane_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s32_0
   return vqrdmulhq_lane_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_f32_0
   return vmul_lane_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f32_0
   return vmulq_lane_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f32_0
   return vmul_laneq_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f64_0
   return vmul_laneq_f64(a, v, 0);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f32_0
   return vmulq_laneq_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f64_0
   return vmulq_laneq_f64(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulx_lane_f32_0
   return vmulx_lane_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f32_0
   return vmulxq_lane_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f64_0
   return vmulxq_lane_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulx_laneq_f32_0
   return vmulx_laneq_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f32_0
   return vmulxq_laneq_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f64_0
   return vmulxq_laneq_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I_I]]
 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vmull_high_n_s16
   return vmull_high_n_s16(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I_I]]
 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vmull_high_n_s32
   return vmull_high_n_s32(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I_I]]
 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmull_high_n_u16
   return vmull_high_n_u16(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I_I]]
 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmull_high_n_u32
   return vmull_high_n_u32(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2
+// CHECK:   [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmull_high_n_s16
   return vqdmull_high_n_s16(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2
+// CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmull_high_n_s32
   return vqdmull_high_n_s32(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_s16
   return vmlal_high_n_s16(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_s32
   return vmlal_high_n_s32(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_u16
   return vmlal_high_n_u16(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_u32
   return vmlal_high_n_u32(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
+// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I_I]]
 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_n_s16
   return vqdmlal_high_n_s16(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
+// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I_I]]
 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_n_s32
   return vqdmlal_high_n_s32(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_s16
   return vmlsl_high_n_s16(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_s32
   return vmlsl_high_n_s32(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_u16
   return vmlsl_high_n_u16(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_u32
   return vmlsl_high_n_u32(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
+// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I_I]]
 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_n_s16
   return vqdmlsl_high_n_s16(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
+// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I_I]]
 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_n_s32
   return vqdmlsl_high_n_s32(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
-  // CHECK-LABEL: test_vmul_n_f32
   return vmul_n_f32(a, b);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
-  // CHECK-LABEL: test_vmulq_n_f32
   return vmulq_n_f32(a, b);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x double> [[MUL_I]]
 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
-  // CHECK-LABEL: test_vmulq_n_f64
   return vmulq_n_f64(a, b);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
-  // CHECK-LABEL: test_vfma_n_f32
   return vfma_n_f32(a, b, n);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
-  // CHECK-LABEL: test_vfmaq_n_f32
   return vfmaq_n_f32(a, b, n);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[TMP3]]
+// CHECK:   [[FMLS_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[FMLS1_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLS2_I_I:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLS_I_I]], <2 x float> [[TMP4]], <2 x float> [[FMLS1_I_I]]) #2
+// CHECK:   ret <2 x float> [[FMLS2_I_I]]
 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
-  // CHECK-LABEL: test_vfms_n_f32
   return vfms_n_f32(a, b, n);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[TMP3]]
+// CHECK:   [[FMLS_I_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[FMLS1_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLS2_I_I:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLS_I_I]], <4 x float> [[TMP4]], <4 x float> [[FMLS1_I_I]]) #2
+// CHECK:   ret <4 x float> [[FMLS2_I_I]]
 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
-  // CHECK-LABEL: test_vfmsq_n_f32
   return vfmsq_n_f32(a, b, n);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vmul_n_s16
   return vmul_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vmulq_n_s16
   return vmulq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vmul_n_s32
   return vmul_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vmulq_n_s32
   return vmulq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmul_n_u16
   return vmul_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmulq_n_u16
   return vmulq_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmul_n_u32
   return vmul_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmulq_n_u32
   return vmulq_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vmull_n_s16
   return vmull_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vmull_n_s32
   return vmull_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmull_n_u16
   return vmull_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmull_n_u32
   return vmull_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2
+// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmull_n_s16
   return vqdmull_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmull_n_s32
   return vqdmull_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2
+// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmulh_n_s16
   return vqdmulh_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2
+// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmulhq_n_s16
   return vqdmulhq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmulh_n_s32
   return vqdmulh_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2
+// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmulhq_n_s32
   return vqdmulhq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2
+// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqrdmulh_n_s16
   return vqrdmulh_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2
+// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqrdmulhq_n_s16
   return vqrdmulhq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqrdmulh_n_s32
   return vqrdmulh_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2
+// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqrdmulhq_n_s32
   return vqrdmulhq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmla_n_s16
   return vmla_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlaq_n_s16
   return vmlaq_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmla_n_s32
   return vmla_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_s32
   return vmlaq_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmla_n_u16
   return vmla_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlaq_n_u16
   return vmlaq_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmla_n_u32
   return vmla_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_u32
   return vmlaq_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlal_n_s16
   return vmlal_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlal_n_s32
   return vmlal_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlal_n_u16
   return vmlal_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlal_n_u32
   return vmlal_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlal_n_s16
   return vqdmlal_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlal_n_s32
   return vqdmlal_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmls_n_s16
   return vmls_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsq_n_s16
   return vmlsq_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmls_n_s32
   return vmls_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_s32
   return vmlsq_n_s32(a, b, c);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmls_n_u16
   return vmls_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsq_n_u16
   return vmlsq_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmls_n_u32
   return vmls_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_u32
   return vmlsq_n_u32(a, b, c);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsl_n_s16
   return vmlsl_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsl_n_s32
   return vmlsl_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsl_n_u16
   return vmlsl_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsl_n_u32
   return vmlsl_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlsl_n_s16
   return vqdmlsl_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlsl_n_s32
   return vqdmlsl_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_u16_0
   return vmla_lane_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u16_0
   return vmlaq_lane_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_u32_0
   return vmla_lane_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u32_0
   return vmlaq_lane_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u16_0
   return vmla_laneq_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u16_0
   return vmlaq_laneq_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u32_0
   return vmla_laneq_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u32_0
   return vmlaq_laneq_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s16_0
   return vqdmlal_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s32_0
   return vqdmlal_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s16_0
   return vqdmlal_high_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s32_0
   return vqdmlal_high_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_u16_0
   return vmls_lane_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u16_0
   return vmlsq_lane_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_u32_0
   return vmls_lane_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u32_0
   return vmlsq_lane_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u16_0
   return vmls_laneq_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u16_0
   return vmlsq_laneq_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u32_0
   return vmls_laneq_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u32_0
   return vmlsq_laneq_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s16_0
   return vqdmlsl_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s32_0
   return vqdmlsl_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s16_0
   return vqdmlsl_high_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s32_0
   return vqdmlsl_high_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s16_0
   return vqdmulh_laneq_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s16_0
   return vqdmulhq_laneq_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s32_0
   return vqdmulh_laneq_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s32_0
   return vqdmulhq_laneq_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s16_0
   return vqrdmulh_laneq_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s16_0
   return vqrdmulhq_laneq_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s32_0
   return vqrdmulh_laneq_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s32_0
   return vqrdmulhq_laneq_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_u16
   return vmla_lane_u16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u16
   return vmlaq_lane_u16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_u32
   return vmla_lane_u32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u32
   return vmlaq_lane_u32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u16
   return vmla_laneq_u16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u16
   return vmlaq_laneq_u16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u32
   return vmla_laneq_u32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u32
   return vmlaq_laneq_u32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s16
   return vqdmlal_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s32
   return vqdmlal_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s16
   return vqdmlal_high_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s32
   return vqdmlal_high_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_u16
   return vmls_lane_u16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u16
   return vmlsq_lane_u16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_u32
   return vmls_lane_u32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u32
   return vmlsq_lane_u32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u16
   return vmls_laneq_u16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u16
   return vmlsq_laneq_u16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u32
   return vmls_laneq_u32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u32
   return vmlsq_laneq_u32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s16
   return vqdmlsl_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s32
   return vqdmlsl_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s16
   return vqdmlsl_high_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s32
   return vqdmlsl_high_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s16
   return vqdmulh_laneq_s16(a, v, 7);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s16
   return vqdmulhq_laneq_s16(a, v, 7);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s32
   return vqdmulh_laneq_s32(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s32
   return vqdmulhq_laneq_s32(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s16
   return vqrdmulh_laneq_s16(a, v, 7);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s16
   return vqrdmulhq_laneq_s16(a, v, 7);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s32
   return vqrdmulh_laneq_s32(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s32
   return vqrdmulhq_laneq_s32(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 

Modified: cfe/trunk/test/CodeGen/aarch64-neon-3v.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-3v.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-3v.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-3v.c Wed Mar  9 12:54:42 2016
@@ -1,486 +1,597 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vand_s8
   return vand_s8(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vandq_s8
   return vandq_s8(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vand_s16
   return vand_s16(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vandq_s16
   return vandq_s16(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vand_s32
   return vand_s32(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vandq_s32
   return vandq_s32(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vand_s64
   return vand_s64(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vandq_s64
   return vandq_s64(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vand_u8
   return vand_u8(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vandq_u8
   return vandq_u8(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vand_u16
   return vand_u16(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vandq_u16
   return vandq_u16(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vand_u32
   return vand_u32(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vandq_u32
   return vandq_u32(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vand_u64
   return vand_u64(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vandq_u64
   return vandq_u64(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vorr_s8
   return vorr_s8(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vorrq_s8
   return vorrq_s8(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vorr_s16
   return vorr_s16(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vorrq_s16
   return vorrq_s16(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vorr_s32
   return vorr_s32(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vorrq_s32
   return vorrq_s32(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vorr_s64
   return vorr_s64(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vorrq_s64
   return vorrq_s64(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vorr_u8
   return vorr_u8(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vorrq_u8
   return vorrq_u8(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vorr_u16
   return vorr_u16(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vorrq_u16
   return vorrq_u16(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vorr_u32
   return vorr_u32(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vorrq_u32
   return vorrq_u32(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vorr_u64
   return vorr_u64(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vorrq_u64
   return vorrq_u64(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_veor_s8
   return veor_s8(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_veorq_s8
   return veorq_s8(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_veor_s16
   return veor_s16(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_veorq_s16
   return veorq_s16(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_veor_s32
   return veor_s32(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_veorq_s32
   return veorq_s32(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_veor_s64
   return veor_s64(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_veorq_s64
   return veorq_s64(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_veor_u8
   return veor_u8(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_veorq_u8
   return veorq_u8(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_veor_u16
   return veor_u16(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_veorq_u16
   return veorq_u16(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_veor_u32
   return veor_u32(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_veorq_u32
   return veorq_u32(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_veor_u64
   return veor_u64(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_veorq_u64
   return veorq_u64(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vbic_s8
   return vbic_s8(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vbicq_s8
   return vbicq_s8(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vbic_s16
   return vbic_s16(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vbicq_s16
   return vbicq_s16(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vbic_s32
   return vbic_s32(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vbicq_s32
   return vbicq_s32(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vbic_s64
   return vbic_s64(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vbicq_s64
   return vbicq_s64(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vbic_u8
   return vbic_u8(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vbicq_u8
   return vbicq_u8(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vbic_u16
   return vbic_u16(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vbicq_u16
   return vbicq_u16(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vbic_u32
   return vbic_u32(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vbicq_u32
   return vbicq_u32(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vbic_u64
   return vbic_u64(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vbicq_u64
   return vbicq_u64(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vorn_s8
   return vorn_s8(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vornq_s8
   return vornq_s8(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vorn_s16
   return vorn_s16(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vornq_s16
   return vornq_s16(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vorn_s32
   return vorn_s32(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vornq_s32
   return vornq_s32(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vorn_s64
   return vorn_s64(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vornq_s64
   return vornq_s64(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vorn_u8
   return vorn_u8(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vornq_u8
   return vornq_u8(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vorn_u16
   return vorn_u16(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vornq_u16
   return vornq_u16(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vorn_u32
   return vorn_u32(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vornq_u32
   return vornq_u32(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vorn_u64
   return vorn_u64(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vornq_u64
   return vornq_u64(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }

Modified: cfe/trunk/test/CodeGen/aarch64-neon-across.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-across.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-across.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-across.c Wed Mar  9 12:54:42 2016
@@ -1,271 +1,398 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define i16 @test_vaddlv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 int16_t test_vaddlv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_s8
   return vaddlv_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 int32_t test_vaddlv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_s16
   return vaddlv_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i16 @test_vaddlv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 uint16_t test_vaddlv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_u8
   return vaddlv_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 uint32_t test_vaddlv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_u16
   return vaddlv_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i16 @test_vaddlvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 int16_t test_vaddlvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_s8
   return vaddlvq_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 int32_t test_vaddlvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_s16
   return vaddlvq_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i64 @test_vaddlvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i64 [[VADDLVQ_S32_I]]
 int64_t test_vaddlvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_s32
   return vaddlvq_s32(a);
-  // CHECK: saddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i16 @test_vaddlvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 uint16_t test_vaddlvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_u8
   return vaddlvq_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 uint32_t test_vaddlvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_u16
   return vaddlvq_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i64 @test_vaddlvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i64 [[VADDLVQ_U32_I]]
 uint64_t test_vaddlvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_u32
   return vaddlvq_u32(a);
-  // CHECK: uaddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vmaxv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vmaxv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_s8
   return vmaxv_s8(a);
-  // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vmaxv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_s16
   return vmaxv_s16(a);
-  // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vmaxv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vmaxv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_u8
   return vmaxv_u8(a);
-  // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vmaxv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_u16
   return vmaxv_u16(a);
-  // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vmaxvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vmaxvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vmaxvq_s8
   return vmaxvq_s8(a);
-  // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vmaxvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmaxvq_s16
   return vmaxvq_s16(a);
-  // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vmaxvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VMAXVQ_S32_I]]
 int32_t test_vmaxvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_s32
   return vmaxvq_s32(a);
-  // CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vmaxvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vmaxvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vmaxvq_u8
   return vmaxvq_u8(a);
-  // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vmaxvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmaxvq_u16
   return vmaxvq_u16(a);
-  // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vmaxvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VMAXVQ_U32_I]]
 uint32_t test_vmaxvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_u32
   return vmaxvq_u32(a);
-  // CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vminv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vminv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vminv_s8
   return vminv_s8(a);
-  // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vminv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vminv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vminv_s16
   return vminv_s16(a);
-  // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vminv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vminv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vminv_u8
   return vminv_u8(a);
-  // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vminv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vminv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vminv_u16
   return vminv_u16(a);
-  // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vminvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vminvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vminvq_s8
   return vminvq_s8(a);
-  // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vminvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vminvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vminvq_s16
   return vminvq_s16(a);
-  // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vminvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VMINVQ_S32_I]]
 int32_t test_vminvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vminvq_s32
   return vminvq_s32(a);
-  // CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vminvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vminvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vminvq_u8
   return vminvq_u8(a);
-  // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vminvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vminvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vminvq_u16
   return vminvq_u16(a);
-  // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vminvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VMINVQ_U32_I]]
 uint32_t test_vminvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vminvq_u32
   return vminvq_u32(a);
-  // CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vaddv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vaddv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vaddv_s8
   return vaddv_s8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vaddv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vaddv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vaddv_s16
   return vaddv_s16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vaddv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vaddv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vaddv_u8
   return vaddv_u8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vaddv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vaddv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vaddv_u16
   return vaddv_u16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vaddvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vaddvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vaddvq_s8
   return vaddvq_s8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vaddvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vaddvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vaddvq_s16
   return vaddvq_s16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vaddvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDVQ_S32_I]]
 int32_t test_vaddvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_s32
   return vaddvq_s32(a);
-  // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vaddvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vaddvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vaddvq_u8
   return vaddvq_u8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vaddvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vaddvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddvq_u16
   return vaddvq_u16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vaddvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDVQ_U32_I]]
 uint32_t test_vaddvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_u32
   return vaddvq_u32(a);
-  // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vmaxvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret float [[VMAXVQ_F32_I]]
 float32_t test_vmaxvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_f32
   return vmaxvq_f32(a);
-  // CHECK: fmaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vminvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret float [[VMINVQ_F32_I]]
 float32_t test_vminvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminvq_f32
   return vminvq_f32(a);
-  // CHECK: fminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vmaxnmvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret float [[VMAXNMVQ_F32_I]]
 float32_t test_vmaxnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxnmvq_f32
   return vmaxnmvq_f32(a);
-  // CHECK: fmaxnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vminnmvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret float [[VMINNMVQ_F32_I]]
 float32_t test_vminnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminnmvq_f32
   return vminnmvq_f32(a);
-  // CHECK: fminnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

Modified: cfe/trunk/test/CodeGen/aarch64-neon-extract.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-extract.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-extract.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-extract.c Wed Mar  9 12:54:42 2016
@@ -1,148 +1,247 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vext_s8
   return vext_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vext_s16
   return vext_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vext_s32
   return vext_s32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vext_s64
   return vext_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vextq_s8
   return vextq_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vextq_s16
   return vextq_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x i32> [[VEXT]]
 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vextq_s32
   return vextq_s32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vextq_s64
   return vextq_s64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vext_u8
   return vext_u8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vext_u16
   return vext_u16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vext_u32
   return vext_u32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vext_u64
   return vext_u64(a, b, 0);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vextq_u8
   return vextq_u8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vextq_u16
   return vextq_u16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x i32> [[VEXT]]
 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vextq_u32
   return vextq_u32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vextq_u64
   return vextq_u64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x float> [[VEXT]]
 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vext_f32
   return vext_f32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x double> [[VEXT]]
 float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vext_f64
   return vext_f64(a, b, 0);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x float> [[VEXT]]
 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vextq_f32
   return vextq_f32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x double> [[VEXT]]
 float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vextq_f64
   return vextq_f64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vext_p8
   return vext_p8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vext_p16
   return vext_p16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vextq_p8
   return vextq_p8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vextq_p16
   return vextq_p16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }

Modified: cfe/trunk/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-fcvt-intrinsics.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-fcvt-intrinsics.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-fcvt-intrinsics.c Wed Mar  9 12:54:42 2016
@@ -1,133 +1,153 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define float @test_vcvtxd_f32_f64(double %a) #0 {
+// CHECK:   [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double %a) #2
+// CHECK:   ret float [[VCVTXD_F32_F64_I]]
 float32_t test_vcvtxd_f32_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtxd_f32_f64
-// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}}
   return (float32_t)vcvtxd_f32_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtas_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTAS_S32_F32_I]]
 int32_t test_vcvtas_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtas_s32_f32
-// CHECK: fcvtas {{[ws][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtas_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_test_vcvtad_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTAD_S64_F64_I]]
 int64_t test_test_vcvtad_s64_f64(float64_t a) {
-// CHECK-LABEL: test_test_vcvtad_s64_f64
-// CHECK: fcvtas {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtad_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtas_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTAS_U32_F32_I]]
 uint32_t test_vcvtas_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtas_u32_f32
-// CHECK: fcvtau {{[ws][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtas_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtad_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTAD_U64_F64_I]]
 uint64_t test_vcvtad_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtad_u64_f64
-// CHECK: fcvtau {{[xd][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtad_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtms_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTMS_S32_F32_I]]
 int32_t test_vcvtms_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtms_s32_f32
-// CHECK: fcvtms {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtms_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtmd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTMD_S64_F64_I]]
 int64_t test_vcvtmd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtmd_s64_f64
-// CHECK: fcvtms {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtmd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtms_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTMS_U32_F32_I]]
 uint32_t test_vcvtms_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtms_u32_f32
-// CHECK: fcvtmu {{[ws][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtms_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtmd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTMD_U64_F64_I]]
 uint64_t test_vcvtmd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtmd_u64_f64
-// CHECK: fcvtmu {{[xd][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtmd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtns_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTNS_S32_F32_I]]
 int32_t test_vcvtns_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtns_s32_f32
-// CHECK: fcvtns {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtns_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtnd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTND_S64_F64_I]]
 int64_t test_vcvtnd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtnd_s64_f64
-// CHECK: fcvtns {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtnd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtns_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTNS_U32_F32_I]]
 uint32_t test_vcvtns_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtns_u32_f32
-// CHECK: fcvtnu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtns_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtnd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTND_U64_F64_I]]
 uint64_t test_vcvtnd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtnd_u64_f64
-// CHECK: fcvtnu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtnd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtps_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTPS_S32_F32_I]]
 int32_t test_vcvtps_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtps_s32_f32
-// CHECK: fcvtps {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtps_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtpd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTPD_S64_F64_I]]
 int64_t test_vcvtpd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtpd_s64_f64
-// CHECK: fcvtps {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtpd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtps_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTPS_U32_F32_I]]
 uint32_t test_vcvtps_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtps_u32_f32
-// CHECK: fcvtpu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtps_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtpd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTPD_U64_F64_I]]
 uint64_t test_vcvtpd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtpd_u64_f64
-// CHECK: fcvtpu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtpd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvts_s32_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptosi float %a to i32
+// CHECK:   ret i32 [[TMP0]]
 int32_t test_vcvts_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_s32_f32
-// CHECK: fcvtzs {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvts_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtd_s64_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptosi double %a to i64
+// CHECK:   ret i64 [[TMP0]]
 int64_t test_vcvtd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_s64_f64
-// CHECK: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvts_u32_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptoui float %a to i32
+// CHECK:   ret i32 [[TMP0]]
 uint32_t test_vcvts_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_u32_f32
-// CHECK: fcvtzu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvts_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtd_u64_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptoui double %a to i64
+// CHECK:   ret i64 [[TMP0]]
 uint64_t test_vcvtd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_u64_f64
-// CHECK: fcvtzu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtd_u64_f64(a);
 }

Modified: cfe/trunk/test/CodeGen/aarch64-neon-fma.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-fma.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-fma.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-fma.c Wed Mar  9 12:54:42 2016
@@ -1,199 +1,243 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
-  // CHECK-LABEL: test_vmla_n_f32
   return vmla_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_f32
   return vmlaq_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmlaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %a, [[MUL_I]]
+// CHECK:   ret <2 x double> [[ADD_I]]
 float64x2_t test_vmlaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vmlaq_n_f64
   return vmlaq_n_f64(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-  // CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_f32
   return vmlsq_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
-  // CHECK-LABEL: test_vmls_n_f32
   return vmls_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmlsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %a, [[MUL_I]]
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vmlsq_n_f64
   return vmlsq_n_f64(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-  // CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_f32_0
   return vmla_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_f32_0
   return vmlaq_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_f32_0
   return vmla_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_f32_0
   return vmlaq_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_f32_0
   return vmls_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_f32_0
   return vmlsq_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_f32_0
   return vmls_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_f32_0
   return vmlsq_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_f32
   return vmla_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_f32
   return vmlaq_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_f32
   return vmla_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_f32
   return vmlaq_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_f32
   return vmls_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_f32
   return vmlsq_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
+// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_f32
   return vmls_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_f32
   return vmlsq_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #2
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vfmaq_n_f64:
   return vfmaq_n_f64(a, b, c);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP3]]
+// CHECK:   [[FMLS_I_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[FMLS1_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[FMLS2_I_I:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLS_I_I]], <2 x double> [[TMP4]], <2 x double> [[FMLS1_I_I]]) #2
+// CHECK:   ret <2 x double> [[FMLS2_I_I]]
 float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vfmsq_n_f64:
   return vfmsq_n_f64(a, b, c);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
 }




More information about the cfe-commits mailing list