[clang] 42c7bc0 - [AArch64][ARM] Make neon fp16 generic intrinsics always available. (#87467)

via cfe-commits cfe-commits at lists.llvm.org
Wed Apr 3 11:10:18 PDT 2024


Author: David Green
Date: 2024-04-03T19:10:14+01:00
New Revision: 42c7bc04c30b427414a2d957776b1655abb27b6e

URL: https://github.com/llvm/llvm-project/commit/42c7bc04c30b427414a2d957776b1655abb27b6e
DIFF: https://github.com/llvm/llvm-project/commit/42c7bc04c30b427414a2d957776b1655abb27b6e.diff

LOG: [AArch64][ARM] Make neon fp16 generic intrinsics always available. (#87467)

By generic intrinsics this means operations such as dup, ext, zip and bsl that
can always be executed with integer s16 operations and do not require
fullfp16. This makes them always available, bringing them in line with
GCC.
https://godbolt.org/z/azs8eMv54

The relevant test cases have been moved into their own files, so that they
can be tested with both armv8-a and armv8.2-a+fp16.
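
As a rough illustration (not part of this commit), the sketch below now
compiles for a plain armv8-a target with no fullfp16 feature (e.g.
-march=armv8-a), because each intrinsic lowers to bitwise selects or integer
lane shuffles rather than half-precision arithmetic; the function name is
invented for the example:

    #include <arm_neon.h>

    // Select lanes from a or b under mask, rotate by two lanes, then reverse.
    // vbsl_f16 is a bitwise select; vext_f16 and vrev64_f16 are pure lane
    // shuffles, so none of them need the fullfp16 feature after this change.
    float16x4_t select_and_reverse(uint16x4_t mask, float16x4_t a, float16x4_t b) {
      float16x4_t sel = vbsl_f16(mask, a, b);
      return vrev64_f16(vext_f16(sel, sel, 2));
    }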

Added: 
    clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-generic.c
    clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c

Modified: 
    clang/include/clang/Basic/arm_neon.td
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
    clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index f16de97f4e6bda..7edac5afafaa99 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1758,24 +1758,21 @@ let TargetGuard = "fullfp16" in {
   // Mul lane
   def VMUL_LANEH    : IOpInst<"vmul_lane", "..qI", "hQh", OP_MUL_LN>;
   def VMUL_NH       : IOpInst<"vmul_n", "..1", "hQh", OP_MUL_N>;
+}
 
-  // Data processing intrinsics - section 5
-
-  // Logical operations
-  let isHiddenLInst = 1 in
-  def VBSLH    : SInst<"vbsl", ".U..", "hQh">;
-
-  // Transposition operations
-  def VZIPH    : WInst<"vzip", "2..", "hQh">;
-  def VUZPH    : WInst<"vuzp", "2..", "hQh">;
-  def VTRNH    : WInst<"vtrn", "2..", "hQh">;
-
-  // Vector Extract
-  def VEXTH      : WInst<"vext", "...I", "hQh">;
+// Data processing intrinsics - section 5. Do not require fullfp16.
 
-  // Reverse vector elements
-  def VREV64H    : WOpInst<"vrev64", "..", "hQh", OP_REV64>;
-}
+// Logical operations
+let isHiddenLInst = 1 in
+def VBSLH    : SInst<"vbsl", ".U..", "hQh">;
+// Transposition operations
+def VZIPH    : WInst<"vzip", "2..", "hQh">;
+def VUZPH    : WInst<"vuzp", "2..", "hQh">;
+def VTRNH    : WInst<"vtrn", "2..", "hQh">;
+// Vector Extract
+def VEXTH      : WInst<"vext", "...I", "hQh">;
+// Reverse vector elements
+def VREV64H    : WOpInst<"vrev64", "..", "hQh", OP_REV64>;
 
 // ARMv8.2-A FP16 vector intrinsics for A64 only.
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in {
@@ -1857,7 +1854,9 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in {
   def VMINVH   : SInst<"vminv", "1.", "hQh">;
   def FMAXNMVH : SInst<"vmaxnmv", "1.", "hQh">;
   def FMINNMVH : SInst<"vminnmv", "1.", "hQh">;
+}
 
+let ArchGuard = "defined(__aarch64__)" in {
   // Permutation
   def VTRN1H     : SOpInst<"vtrn1", "...", "hQh", OP_TRN1>;
   def VZIP1H     : SOpInst<"vzip1", "...", "hQh", OP_ZIP1>;

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 483f9c26859923..2537e715b63ee4 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7281,8 +7281,6 @@ static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
   { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, },
   { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, },
   { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, },
-  { NEON::BI__builtin_neon_vbsl_f16, NEON::BI__builtin_neon_vbsl_v, },
-  { NEON::BI__builtin_neon_vbslq_f16, NEON::BI__builtin_neon_vbslq_v, },
   { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, },
   { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, },
   { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, },
@@ -7301,8 +7299,6 @@ static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
   { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, },
   { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, },
   { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, },
-  { NEON::BI__builtin_neon_vext_f16, NEON::BI__builtin_neon_vext_v, },
-  { NEON::BI__builtin_neon_vextq_f16, NEON::BI__builtin_neon_vextq_v, },
   { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, },
   { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, },
   { NEON::BI__builtin_neon_vfma_laneq_f16, NEON::BI__builtin_neon_vfma_laneq_v, },
@@ -7405,12 +7401,6 @@ static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
   { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v },
   { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
   { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v },
-  { NEON::BI__builtin_neon_vtrn_f16, NEON::BI__builtin_neon_vtrn_v, },
-  { NEON::BI__builtin_neon_vtrnq_f16, NEON::BI__builtin_neon_vtrnq_v, },
-  { NEON::BI__builtin_neon_vuzp_f16, NEON::BI__builtin_neon_vuzp_v, },
-  { NEON::BI__builtin_neon_vuzpq_f16, NEON::BI__builtin_neon_vuzpq_v, },
-  { NEON::BI__builtin_neon_vzip_f16, NEON::BI__builtin_neon_vzip_v, },
-  { NEON::BI__builtin_neon_vzipq_f16, NEON::BI__builtin_neon_vzipq_v, },
   // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
   // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
   // arbitrary one to be handled as tha canonical variation.

diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-generic.c
new file mode 100644
index 00000000000000..78391808fafae3
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-generic.c
@@ -0,0 +1,485 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature -fullfp16 -target-feature +v8a\
+// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg \
+// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
+// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define {{[^@]+}}@test_vbsl_f16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <4 x i16> [[A]], [[VBSL1_I]]
+// CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half>
+// CHECK-NEXT:    ret <4 x half> [[TMP4]]
+//
+float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
+  return vbsl_f16(a, b, c);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vbslq_f16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <8 x i16> [[A]], [[VBSL1_I]]
+// CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
+float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
+  return vbslq_f16(a, b, c);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vzip_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT:    store <4 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
+// CHECK-NEXT:    [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT:    store <4 x half> [[VZIP1_I]], ptr [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
+// CHECK-NEXT:    store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+//
+float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
+  return vzip_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vzipq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT:    store <8 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
+// CHECK-NEXT:    [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT:    store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
+// CHECK-NEXT:    store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+//
+float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
+  return vzipq_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT:    store <4 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
+// CHECK-NEXT:    [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT:    store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
+// CHECK-NEXT:    store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+//
+float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
+  return vuzp_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vuzpq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT:    store <8 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
+// CHECK-NEXT:    [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT:    store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
+// CHECK-NEXT:    store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+//
+float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
+  return vuzpq_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT:    store <4 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
+// CHECK-NEXT:    [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT:    store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
+// CHECK-NEXT:    store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+//
+float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
+  return vtrn_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vtrnq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT:    store <8 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
+// CHECK-NEXT:    [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT:    store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
+// CHECK-NEXT:    store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+//
+float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
+  return vtrnq_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vmov_n_f16
+// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT:    ret <4 x half> [[VECINIT3]]
+//
+float16x4_t test_vmov_n_f16(float16_t a) {
+  return vmov_n_f16(a);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vmovq_n_f16
+// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NEXT:    ret <8 x half> [[VECINIT7]]
+//
+float16x8_t test_vmovq_n_f16(float16_t a) {
+  return vmovq_n_f16(a);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vdup_n_f16
+// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT:    ret <4 x half> [[VECINIT3]]
+//
+float16x4_t test_vdup_n_f16(float16_t a) {
+  return vdup_n_f16(a);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vdupq_n_f16
+// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NEXT:    ret <8 x half> [[VECINIT7]]
+//
+float16x8_t test_vdupq_n_f16(float16_t a) {
+  return vdupq_n_f16(a);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vdup_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    ret <4 x half> [[LANE]]
+//
+float16x4_t test_vdup_lane_f16(float16x4_t a) {
+  return vdup_lane_f16(a, 3);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vdupq_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    ret <8 x half> [[LANE]]
+//
+float16x8_t test_vdupq_lane_f16(float16x4_t a) {
+  return vdupq_lane_f16(a, 3);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vdup_laneq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    ret <4 x half> [[LANE]]
+//
+float16x4_t test_vdup_laneq_f16(float16x8_t a) {
+  return vdup_laneq_f16(a, 1);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vdupq_laneq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    ret <8 x half> [[LANE]]
+//
+float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
+  return vdupq_laneq_f16(a, 7);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vext_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    ret <4 x half> [[VEXT]]
+//
+float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
+  return vext_f16(a, b, 2);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vextq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK-NEXT:    ret <8 x half> [[VEXT]]
+//
+float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
+  return vextq_f16(a, b, 5);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vrev64_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
+float16x4_t test_vrev64_f16(float16x4_t a) {
+  return vrev64_f16(a);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vrev64q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
+float16x8_t test_vrev64q_f16(float16x8_t a) {
+  return vrev64q_f16(a);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vzip1_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
+float16x4_t test_vzip1_f16(float16x4_t a, float16x4_t b) {
+  return vzip1_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vzip1q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
+float16x8_t test_vzip1q_f16(float16x8_t a, float16x8_t b) {
+  return vzip1q_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vzip2_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
+float16x4_t test_vzip2_f16(float16x4_t a, float16x4_t b) {
+  return vzip2_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vzip2q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
+float16x8_t test_vzip2q_f16(float16x8_t a, float16x8_t b) {
+  return vzip2q_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp1_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
+float16x4_t test_vuzp1_f16(float16x4_t a, float16x4_t b) {
+  return vuzp1_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp1q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
+float16x8_t test_vuzp1q_f16(float16x8_t a, float16x8_t b) {
+  return vuzp1q_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp2_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
+float16x4_t test_vuzp2_f16(float16x4_t a, float16x4_t b) {
+  return vuzp2_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp2q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
+float16x8_t test_vuzp2q_f16(float16x8_t a, float16x8_t b) {
+  return vuzp2q_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn1_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
+float16x4_t test_vtrn1_f16(float16x4_t a, float16x4_t b) {
+  return vtrn1_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn1q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
+float16x8_t test_vtrn1q_f16(float16x8_t a, float16x8_t b) {
+  return vtrn1q_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn2_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
+float16x4_t test_vtrn2_f16(float16x4_t a, float16x4_t b) {
+  return vtrn2_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn2q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
+float16x8_t test_vtrn2q_f16(float16x8_t a, float16x8_t b) {
+  return vtrn2q_f16(a, b);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vduph_laneq_f16
+// CHECK-SAME: (<8 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x half> [[VEC]], i32 7
+// CHECK-NEXT:    ret half [[VGETQ_LANE]]
+//
+float16_t test_vduph_laneq_f16(float16x8_t vec) {
+  return vduph_laneq_f16(vec, 7);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test_vduph_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x half> [[VEC]], i32 3
+// CHECK-NEXT:    ret half [[VGET_LANE]]
+//
+float16_t test_vduph_lane_f16(float16x4_t vec) {
+  return vduph_lane_f16(vec, 3);
+}

diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index 4163e6e0918f19..617d515504fe2b 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -2004,475 +2004,3 @@ float16_t test_vminnmv_f16(float16x4_t a) {
 float16_t test_vminnmvq_f16(float16x8_t a) {
   return vminnmvq_f16(a);
 }
-
-// CHECK-LABEL: define {{[^@]+}}@test_vbsl_f16
-// CHECK-SAME: (<4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <4 x i16> [[A]], [[VBSL1_I]]
-// CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1>
-// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
-// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half>
-// CHECK-NEXT:    ret <4 x half> [[TMP4]]
-//
-float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
-  return vbsl_f16(a, b, c);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vbslq_f16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <8 x i16> [[A]], [[VBSL1_I]]
-// CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
-// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half>
-// CHECK-NEXT:    ret <8 x half> [[TMP4]]
-//
-float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
-  return vbslq_f16(a, b, c);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vzip_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT:    store <4 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT:    [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT:    store <4 x half> [[VZIP1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT:    store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
-//
-float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
-  return vzip_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vzipq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT:    store <8 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT:    [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-NEXT:    store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT:    store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
-//
-float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
-  return vzipq_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vuzp_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT:    store <4 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT:    [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT:    store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT:    store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
-//
-float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
-  return vuzp_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vuzpq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT:    store <8 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT:    [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-NEXT:    store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT:    store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
-//
-float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
-  return vuzpq_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vtrn_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT:    store <4 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT:    [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT:    store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT:    store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
-//
-float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
-  return vtrn_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vtrnq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT:    store <8 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT:    [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-NEXT:    store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT:    store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
-//
-float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
-  return vtrnq_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vmov_n_f16
-// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NEXT:    ret <4 x half> [[VECINIT3]]
-//
-float16x4_t test_vmov_n_f16(float16_t a) {
-  return vmov_n_f16(a);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vmovq_n_f16
-// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
-// CHECK-NEXT:    ret <8 x half> [[VECINIT7]]
-//
-float16x8_t test_vmovq_n_f16(float16_t a) {
-  return vmovq_n_f16(a);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vdup_n_f16
-// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NEXT:    ret <4 x half> [[VECINIT3]]
-//
-float16x4_t test_vdup_n_f16(float16_t a) {
-  return vdup_n_f16(a);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vdupq_n_f16
-// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
-// CHECK-NEXT:    ret <8 x half> [[VECINIT7]]
-//
-float16x8_t test_vdupq_n_f16(float16_t a) {
-  return vdupq_n_f16(a);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vdup_lane_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    ret <4 x half> [[LANE]]
-//
-float16x4_t test_vdup_lane_f16(float16x4_t a) {
-  return vdup_lane_f16(a, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vdupq_lane_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    ret <8 x half> [[LANE]]
-//
-float16x8_t test_vdupq_lane_f16(float16x4_t a) {
-  return vdupq_lane_f16(a, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vdup_laneq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    ret <4 x half> [[LANE]]
-//
-float16x4_t test_vdup_laneq_f16(float16x8_t a) {
-  return vdup_laneq_f16(a, 1);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vdupq_laneq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    ret <8 x half> [[LANE]]
-//
-float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
-  return vdupq_laneq_f16(a, 7);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vext_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT:    [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-// CHECK-NEXT:    ret <4 x half> [[VEXT]]
-//
-float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
-  return vext_f16(a, b, 2);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vextq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT:    [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
-// CHECK-NEXT:    ret <8 x half> [[VEXT]]
-//
-float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
-  return vextq_f16(a, b, 5);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vrev64_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
-//
-float16x4_t test_vrev64_f16(float16x4_t a) {
-  return vrev64_f16(a);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vrev64q_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
-//
-float16x8_t test_vrev64q_f16(float16x8_t a) {
-  return vrev64q_f16(a);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vzip1_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
-//
-float16x4_t test_vzip1_f16(float16x4_t a, float16x4_t b) {
-  return vzip1_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vzip1q_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
-//
-float16x8_t test_vzip1q_f16(float16x8_t a, float16x8_t b) {
-  return vzip1q_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vzip2_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
-//
-float16x4_t test_vzip2_f16(float16x4_t a, float16x4_t b) {
-  return vzip2_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vzip2q_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
-//
-float16x8_t test_vzip2q_f16(float16x8_t a, float16x8_t b) {
-  return vzip2q_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vuzp1_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
-//
-float16x4_t test_vuzp1_f16(float16x4_t a, float16x4_t b) {
-  return vuzp1_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vuzp1q_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
-//
-float16x8_t test_vuzp1q_f16(float16x8_t a, float16x8_t b) {
-  return vuzp1q_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vuzp2_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
-//
-float16x4_t test_vuzp2_f16(float16x4_t a, float16x4_t b) {
-  return vuzp2_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vuzp2q_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
-//
-float16x8_t test_vuzp2q_f16(float16x8_t a, float16x8_t b) {
-  return vuzp2q_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vtrn1_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
-//
-float16x4_t test_vtrn1_f16(float16x4_t a, float16x4_t b) {
-  return vtrn1_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vtrn1q_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
-//
-float16x8_t test_vtrn1q_f16(float16x8_t a, float16x8_t b) {
-  return vtrn1q_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vtrn2_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
-//
-float16x4_t test_vtrn2_f16(float16x4_t a, float16x4_t b) {
-  return vtrn2_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vtrn2q_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
-//
-float16x8_t test_vtrn2q_f16(float16x8_t a, float16x8_t b) {
-  return vtrn2q_f16(a, b);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vduph_laneq_f16
-// CHECK-SAME: (<8 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x half> [[VEC]], i32 7
-// CHECK-NEXT:    ret half [[VGETQ_LANE]]
-//
-float16_t test_vduph_laneq_f16(float16x8_t vec) {
-  return vduph_laneq_f16(vec, 7);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vduph_lane_f16
-// CHECK-SAME: (<4 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x half> [[VEC]], i32 3
-// CHECK-NEXT:    ret half [[VGET_LANE]]
-//
-float16_t test_vduph_lane_f16(float16x4_t vec) {
-  return vduph_lane_f16(vec, 3);
-}

diff  --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
new file mode 100644
index 00000000000000..f8d83332ab01d1
--- /dev/null
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
@@ -0,0 +1,600 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature -fullfp16 \
+// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa \
+// RUN: | FileCheck %s --check-prefixes=CHECK-NOFP16
+// RUN: %clang_cc1 -triple armv8a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
+// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa \
+// RUN: | FileCheck %s --check-prefixes=CHECK-FP16
+
+// REQUIRES: arm-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vbsl_f16(
+// CHECK-NOFP16-SAME: <4 x i16> noundef [[A:%.*]], <2 x i32> noundef [[B_COERCE:%.*]], <2 x i32> noundef [[C_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[C_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP8:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> [[TMP8]])
+// CHECK-NOFP16-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    ret <2 x i32> [[TMP12]]
+//
+// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vbsl_f16(
+// CHECK-FP16-SAME: <4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-FP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x half>
+// CHECK-FP16-NEXT:    ret <4 x half> [[TMP3]]
+//
+float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
+  return vbsl_f16(a, b, c);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vbslq_f16(
+// CHECK-NOFP16-SAME: <8 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B_COERCE:%.*]], <4 x i32> noundef [[C_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[C_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+// CHECK-NOFP16-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    ret <4 x i32> [[TMP12]]
+//
+// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vbslq_f16(
+// CHECK-FP16-SAME: <8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-FP16-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x half>
+// CHECK-FP16-NEXT:    ret <8 x half> [[TMP3]]
+//
+float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
+  return vbslq_f16(a, b, c);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local void @test_vzip_f16(
+// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NOFP16-NEXT:    store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
+// CHECK-NOFP16-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NOFP16-NEXT:    [[VZIP3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NOFP16-NEXT:    store <4 x i16> [[VZIP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META3]]
+// CHECK-NOFP16-NEXT:    ret void
+//
+// CHECK-FP16-LABEL: define dso_local void @test_vzip_f16(
+// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-FP16-NEXT:    store <4 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT:    [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-FP16-NEXT:    store <4 x half> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META3]]
+// CHECK-FP16-NEXT:    ret void
+//
+float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
+  return vzip_f16(a, b);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local void @test_vzipq_f16(
+// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NOFP16-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NOFP16-NEXT:    [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NOFP16-NEXT:    store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
+// CHECK-NOFP16-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NOFP16-NEXT:    [[VZIP3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NOFP16-NEXT:    store <8 x i16> [[VZIP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META6]]
+// CHECK-NOFP16-NEXT:    ret void
+//
+// CHECK-FP16-LABEL: define dso_local void @test_vzipq_f16(
+// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-FP16-NEXT:    store <8 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT:    [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-FP16-NEXT:    store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META6]]
+// CHECK-FP16-NEXT:    ret void
+//
+float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
+  return vzipq_f16(a, b);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local void @test_vuzp_f16(
+// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NOFP16-NEXT:    store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
+// CHECK-NOFP16-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NOFP16-NEXT:    [[VUZP3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NOFP16-NEXT:    store <4 x i16> [[VUZP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META9]]
+// CHECK-NOFP16-NEXT:    ret void
+//
+// CHECK-FP16-LABEL: define dso_local void @test_vuzp_f16(
+// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-FP16-NEXT:    store <4 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT:    [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-FP16-NEXT:    store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META9]]
+// CHECK-FP16-NEXT:    ret void
+//
+float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
+  return vuzp_f16(a, b);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local void @test_vuzpq_f16(
+// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NOFP16-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NOFP16-NEXT:    [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NOFP16-NEXT:    store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
+// CHECK-NOFP16-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NOFP16-NEXT:    [[VUZP3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NOFP16-NEXT:    store <8 x i16> [[VUZP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META12]]
+// CHECK-NOFP16-NEXT:    ret void
+//
+// CHECK-FP16-LABEL: define dso_local void @test_vuzpq_f16(
+// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-FP16-NEXT:    store <8 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT:    [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-FP16-NEXT:    store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META12]]
+// CHECK-FP16-NEXT:    ret void
+//
+float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
+  return vuzpq_f16(a, b);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local void @test_vtrn_f16(
+// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NOFP16-NEXT:    store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
+// CHECK-NOFP16-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NOFP16-NEXT:    [[VTRN3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NOFP16-NEXT:    store <4 x i16> [[VTRN3_I]], ptr [[TMP10]], align 4, !alias.scope [[META15]]
+// CHECK-NOFP16-NEXT:    ret void
+//
+// CHECK-FP16-LABEL: define dso_local void @test_vtrn_f16(
+// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-FP16-NEXT:    store <4 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT:    [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-FP16-NEXT:    store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META15]]
+// CHECK-FP16-NEXT:    ret void
+//
+float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
+  return vtrn_f16(a, b);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local void @test_vtrnq_f16(
+// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NOFP16-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NOFP16-NEXT:    [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NOFP16-NEXT:    store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
+// CHECK-NOFP16-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NOFP16-NEXT:    [[VTRN3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NOFP16-NEXT:    store <8 x i16> [[VTRN3_I]], ptr [[TMP10]], align 4, !alias.scope [[META18]]
+// CHECK-NOFP16-NEXT:    ret void
+//
+// CHECK-FP16-LABEL: define dso_local void @test_vtrnq_f16(
+// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-FP16-NEXT:    store <8 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT:    [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-FP16-NEXT:    store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META18]]
+// CHECK-FP16-NEXT:    ret void
+//
+float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
+  return vtrnq_f16(a, b);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vmov_n_f16(
+// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
+// CHECK-NOFP16-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NOFP16-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NOFP16-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VECINIT3]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    ret <2 x i32> [[TMP0]]
+//
+// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vmov_n_f16(
+// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
+// CHECK-FP16-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-FP16-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-FP16-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-FP16-NEXT:    ret <4 x half> [[VECINIT3]]
+//
+float16x4_t test_vmov_n_f16(float16_t a) {
+  return vmov_n_f16(a);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vmovq_n_f16(
+// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
+// CHECK-NOFP16-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NOFP16-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NOFP16-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NOFP16-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-NOFP16-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-NOFP16-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-NOFP16-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VECINIT7]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    ret <4 x i32> [[TMP0]]
+//
+// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vmovq_n_f16(
+// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
+// CHECK-FP16-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-FP16-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-FP16-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-FP16-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-FP16-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-FP16-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-FP16-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-FP16-NEXT:    ret <8 x half> [[VECINIT7]]
+//
+float16x8_t test_vmovq_n_f16(float16_t a) {
+  return vmovq_n_f16(a);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vdup_n_f16(
+// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
+// CHECK-NOFP16-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NOFP16-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NOFP16-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VECINIT3]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    ret <2 x i32> [[TMP0]]
+//
+// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vdup_n_f16(
+// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
+// CHECK-FP16-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-FP16-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-FP16-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-FP16-NEXT:    ret <4 x half> [[VECINIT3]]
+//
+float16x4_t test_vdup_n_f16(float16_t a) {
+  return vdup_n_f16(a);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vdupq_n_f16(
+// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
+// CHECK-NOFP16-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NOFP16-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NOFP16-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NOFP16-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-NOFP16-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-NOFP16-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-NOFP16-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VECINIT7]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    ret <4 x i32> [[TMP0]]
+//
+// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vdupq_n_f16(
+// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
+// CHECK-FP16-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-FP16-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-FP16-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-FP16-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-FP16-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-FP16-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-FP16-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-FP16-NEXT:    ret <8 x half> [[VECINIT7]]
+//
+float16x8_t test_vdupq_n_f16(float16_t a) {
+  return vdupq_n_f16(a);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vdup_lane_f16(
+// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    ret <2 x i32> [[TMP4]]
+//
+// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vdup_lane_f16(
+// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-FP16-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-FP16-NEXT:    ret <4 x half> [[LANE]]
+//
+float16x4_t test_vdup_lane_f16(float16x4_t a) {
+  return vdup_lane_f16(a, 3);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vdupq_lane_f16(
+// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    ret <4 x i32> [[TMP4]]
+//
+// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vdupq_lane_f16(
+// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-FP16-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-FP16-NEXT:    ret <8 x half> [[LANE]]
+//
+float16x8_t test_vdupq_lane_f16(float16x4_t a) {
+  return vdupq_lane_f16(a, 3);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vext_f16(
+// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NOFP16-NEXT:    [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[VEXT]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    ret <2 x i32> [[TMP7]]
+//
+// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vext_f16(
+// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-FP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-FP16-NEXT:    [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK-FP16-NEXT:    ret <4 x half> [[VEXT]]
+//
+float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
+  return vext_f16(a, b, 2);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vextq_f16(
+// CHECK-NOFP16-SAME: <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <16 x i8>
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NOFP16-NEXT:    [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK-NOFP16-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[VEXT]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    ret <4 x i32> [[TMP7]]
+//
+// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vextq_f16(
+// CHECK-FP16-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-FP16-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-FP16-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-FP16-NEXT:    [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK-FP16-NEXT:    ret <8 x half> [[VEXT]]
+//
+float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
+  return vextq_f16(a, b, 5);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vrev64_f16(
+// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <2 x i32>
+// CHECK-NOFP16-NEXT:    ret <2 x i32> [[TMP5]]
+//
+// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vrev64_f16(
+// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-FP16-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
+float16x4_t test_vrev64_f16(float16x4_t a) {
+  return vrev64_f16(a);
+}
+
+// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vrev64q_f16(
+// CHECK-NOFP16-SAME: <4 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NOFP16-NEXT:  entry:
+// CHECK-NOFP16-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NOFP16-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
+// CHECK-NOFP16-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x i32>
+// CHECK-NOFP16-NEXT:    ret <4 x i32> [[TMP5]]
+//
+// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vrev64q_f16(
+// CHECK-FP16-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-FP16-NEXT:  entry:
+// CHECK-FP16-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-FP16-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
+float16x8_t test_vrev64q_f16(float16x8_t a) {
+  return vrev64q_f16(a);
+}
+//.
+// CHECK-NOFP16: [[META3]] = !{[[META4:![0-9]+]]}
+// CHECK-NOFP16: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]], !"vzip_f16: %agg.result"}
+// CHECK-NOFP16: [[META5]] = distinct !{[[META5]], !"vzip_f16"}
+// CHECK-NOFP16: [[META6]] = !{[[META7:![0-9]+]]}
+// CHECK-NOFP16: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"vzipq_f16: %agg.result"}
+// CHECK-NOFP16: [[META8]] = distinct !{[[META8]], !"vzipq_f16"}
+// CHECK-NOFP16: [[META9]] = !{[[META10:![0-9]+]]}
+// CHECK-NOFP16: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]], !"vuzp_f16: %agg.result"}
+// CHECK-NOFP16: [[META11]] = distinct !{[[META11]], !"vuzp_f16"}
+// CHECK-NOFP16: [[META12]] = !{[[META13:![0-9]+]]}
+// CHECK-NOFP16: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]], !"vuzpq_f16: %agg.result"}
+// CHECK-NOFP16: [[META14]] = distinct !{[[META14]], !"vuzpq_f16"}
+// CHECK-NOFP16: [[META15]] = !{[[META16:![0-9]+]]}
+// CHECK-NOFP16: [[META16]] = distinct !{[[META16]], [[META17:![0-9]+]], !"vtrn_f16: %agg.result"}
+// CHECK-NOFP16: [[META17]] = distinct !{[[META17]], !"vtrn_f16"}
+// CHECK-NOFP16: [[META18]] = !{[[META19:![0-9]+]]}
+// CHECK-NOFP16: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]], !"vtrnq_f16: %agg.result"}
+// CHECK-NOFP16: [[META20]] = distinct !{[[META20]], !"vtrnq_f16"}
+//.
+// CHECK-FP16: [[META3]] = !{[[META4:![0-9]+]]}
+// CHECK-FP16: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]], !"vzip_f16: %agg.result"}
+// CHECK-FP16: [[META5]] = distinct !{[[META5]], !"vzip_f16"}
+// CHECK-FP16: [[META6]] = !{[[META7:![0-9]+]]}
+// CHECK-FP16: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"vzipq_f16: %agg.result"}
+// CHECK-FP16: [[META8]] = distinct !{[[META8]], !"vzipq_f16"}
+// CHECK-FP16: [[META9]] = !{[[META10:![0-9]+]]}
+// CHECK-FP16: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]], !"vuzp_f16: %agg.result"}
+// CHECK-FP16: [[META11]] = distinct !{[[META11]], !"vuzp_f16"}
+// CHECK-FP16: [[META12]] = !{[[META13:![0-9]+]]}
+// CHECK-FP16: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]], !"vuzpq_f16: %agg.result"}
+// CHECK-FP16: [[META14]] = distinct !{[[META14]], !"vuzpq_f16"}
+// CHECK-FP16: [[META15]] = !{[[META16:![0-9]+]]}
+// CHECK-FP16: [[META16]] = distinct !{[[META16]], [[META17:![0-9]+]], !"vtrn_f16: %agg.result"}
+// CHECK-FP16: [[META17]] = distinct !{[[META17]], !"vtrn_f16"}
+// CHECK-FP16: [[META18]] = !{[[META19:![0-9]+]]}
+// CHECK-FP16: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]], !"vtrnq_f16: %agg.result"}
+// CHECK-FP16: [[META20]] = distinct !{[[META20]], !"vtrnq_f16"}
+//.

diff  --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
index 477da3a9e20517..c62d1c9de0cb36 100644
--- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
@@ -817,181 +817,3 @@ float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) {
 float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
   return vmulq_n_f16(a, b);
 }
-
-// CHECK-LABEL: test_vbsl_f16
-// CHECK:  [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK:  [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:  [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// CHECK:  [[VBSL:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK:  [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL]] to <4 x half>
-// CHECK:  ret <4 x half> [[TMP3]]
-float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
-  return vbsl_f16(a, b, c);
-}
-
-// CHECK-LABEL: test_vbslq_f16
-// CHECK:  [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK:  [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:  [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// CHECK:  [[VBSL:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK:  [[TMP3:%.*]] = bitcast <16 x i8> [[VBSL]] to <8 x half>
-// CHECK:  ret <8 x half> [[TMP3]]
-float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
-  return vbslq_f16(a, b, c);
-}
-
-// CHECK-LABEL: test_vzip_f16
-// CHECK:  [[VZIP0:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:  store <4 x half> [[VZIP0]], ptr [[addr1:%.*]]
-// CHECK:  [[VZIP1:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:  store <4 x half> [[VZIP1]], ptr [[addr2:%.*]]
-float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
-  return vzip_f16(a, b);
-}
-
-// CHECK-LABEL: test_vzipq_f16
-// CHECK:  [[VZIP0:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:  store <8 x half> [[VZIP0]], ptr [[addr1:%.*]]
-// CHECK:  [[VZIP1:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:  store <8 x half> [[VZIP1]], ptr [[addr2:%.*]]
-float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
-  return vzipq_f16(a, b);
-}
-
-// CHECK-LABEL: test_vuzp_f16
-// CHECK:  [[VUZP0:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:  store <4 x half> [[VUZP0]], ptr [[addr1:%.*]]
-// CHECK:  [[VUZP1:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:  store <4 x half> [[VUZP1]], ptr [[addr1:%.*]]
-float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
-  return vuzp_f16(a, b);
-}
-
-// CHECK-LABEL: test_vuzpq_f16
-// CHECK:   [[VUZP0:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   store <8 x half> [[VUZP0]], ptr [[addr1:%.*]]
-// CHECK:   [[VUZP1:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   store <8 x half> [[VUZP1]], ptr [[addr2:%.*]]
-float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
-  return vuzpq_f16(a, b);
-}
-
-// CHECK-LABEL: test_vtrn_f16
-// CHECK:   [[VTRN0:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   store <4 x half> [[VTRN0]], ptr [[addr1:%.*]]
-// CHECK:   [[VTRN1:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   store <4 x half> [[VTRN1]], ptr [[addr2:%.*]]
-float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
-  return vtrn_f16(a, b);
-}
-
-// CHECK-LABEL: test_vtrnq_f16
-// CHECK:   [[VTRN0:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   store <8 x half> [[VTRN0]], ptr [[addr1:%.*]]
-// CHECK:   [[VTRN1:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32>  <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   store <8 x half> [[VTRN1]], ptr [[addr2:%.*]]
-float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
-  return vtrnq_f16(a, b);
-}
-
-// CHECK-LABEL: test_vmov_n_f16
-// CHECK:   [[TMP0:%.*]] = insertelement <4 x half> poison, half [[ARG:%.*]], i32 0
-// CHECK:   [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[ARG]], i32 1
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[ARG]], i32 2
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[ARG]], i32 3
-// CHECK:   ret <4 x half> [[TMP3]]
-float16x4_t test_vmov_n_f16(float16_t a) {
-  return vmov_n_f16(a);
-}
-
-// CHECK-LABEL: test_vmovq_n_f16
-// CHECK:   [[TMP0:%.*]] = insertelement <8 x half> poison, half [[ARG:%.*]], i32 0
-// CHECK:   [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[ARG]], i32 1
-// CHECK:   [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[ARG]], i32 2
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[ARG]], i32 3
-// CHECK:   [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[ARG]], i32 4
-// CHECK:   [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[ARG]], i32 5
-// CHECK:   [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[ARG]], i32 6
-// CHECK:   [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[ARG]], i32 7
-// CHECK:   ret <8 x half> [[TMP7]]
-float16x8_t test_vmovq_n_f16(float16_t a) {
-  return vmovq_n_f16(a);
-}
-
-// CHECK-LABEL: test_vdup_n_f16
-// CHECK:   [[TMP0:%.*]] = insertelement <4 x half> poison, half [[ARG:%.*]], i32 0
-// CHECK:   [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[ARG]], i32 1
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[ARG]], i32 2
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[ARG]], i32 3
-// CHECK:   ret <4 x half> [[TMP3]]
-float16x4_t test_vdup_n_f16(float16_t a) {
-  return vdup_n_f16(a);
-}
-
-// CHECK-LABEL: test_vdupq_n_f16
-// CHECK:   [[TMP0:%.*]] = insertelement <8 x half> poison, half [[ARG:%.*]], i32 0
-// CHECK:   [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[ARG]], i32 1
-// CHECK:   [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[ARG]], i32 2
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[ARG]], i32 3
-// CHECK:   [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[ARG]], i32 4
-// CHECK:   [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[ARG]], i32 5
-// CHECK:   [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[ARG]], i32 6
-// CHECK:   [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[ARG]], i32 7
-// CHECK:   ret <8 x half> [[TMP7]]
-float16x8_t test_vdupq_n_f16(float16_t a) {
-  return vdupq_n_f16(a);
-}
-
-// CHECK-LABEL: test_vdup_lane_f16
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <4 x half> [[LANE]]
-float16x4_t test_vdup_lane_f16(float16x4_t a) {
-  return vdup_lane_f16(a, 3);
-}
-
-// CHECK-LABEL: test_vdupq_lane_f16
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <8 x half> [[LANE]]
-float16x8_t test_vdupq_lane_f16(float16x4_t a) {
-  return vdupq_lane_f16(a, 3);
-}
-
-// CHECK-LABEL: @test_vext_f16(
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK:   [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-// CHECK:   ret <4 x half> [[VEXT]]
-float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
-  return vext_f16(a, b, 2);
-}
-
-// CHECK-LABEL: @test_vextq_f16(
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK:   [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
-// CHECK:   ret <8 x half> [[VEXT]]
-float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
-  return vextq_f16(a, b, 5);
-}
-
-// CHECK-LABEL: @test_vrev64_f16(
-// CHECK:   [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK:   ret <4 x half> [[SHFL]]
-float16x4_t test_vrev64_f16(float16x4_t a) {
-  return vrev64_f16(a);
-}
-
-// CHECK-LABEL: @test_vrev64q_f16(
-// CHECK:   [[SHFL:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK:   ret <8 x half> [[SHFL]]
-float16x8_t test_vrev64q_f16(float16x8_t a) {
-  return vrev64q_f16(a);
-}


        

